Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"ARM:

- Correctly clean the BSS to the PoC before allowing EL2 to access it
on nVHE/hVHE/protected configurations

- Propagate ownership of debug registers in protected mode after the
rework that landed in 6.14-rc1

- Stop pretending that we can run the protected mode without a GICv3
being present on the host

- Fix a use-after-free situation that can occur if a vcpu fails to
initialise the NV shadow S2 MMU contexts

- Always evaluate the need to arm a background timer for fully
emulated guest timers

- Fix the emulation of EL1 timers in the absence of FEAT_ECV

- Correctly handle the EL2 virtual timer, especially when HCR_EL2.E2H==0

s390:

- Move some of the guest page table (gmap) logic into KVM itself,
inching towards the final goal of completely removing gmap from the
non-KVM memory management code.

As an initial set of cleanups, move some code from mm/gmap into KVM
and start using __kvm_faultin_pfn() to fault in pages as needed;
but especially stop abusing page->index and page->lru to aid in the
pgdesc conversion.

x86:

- Add missing check in the fix to defer starting the huge page
recovery vhost_task

- SRSO_USER_KERNEL_NO does not need SYNTHESIZED_F"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (31 commits)
KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking
KVM: remove kvm_arch_post_init_vm
KVM: selftests: Fix spelling mistake "initally" -> "initially"
kvm: x86: SRSO_USER_KERNEL_NO is not synthesized
KVM: arm64: timer: Don't adjust the EL2 virtual timer offset
KVM: arm64: timer: Correctly handle EL1 timer emulation when !FEAT_ECV
KVM: arm64: timer: Always evaluate the need for a soft timer
KVM: arm64: Fix nested S2 MMU structures reallocation
KVM: arm64: Fail protected mode init if no vgic hardware is present
KVM: arm64: Flush/sync debug state in protected mode
KVM: s390: selftests: Streamline uc_skey test to issue iske after sske
KVM: s390: remove the last user of page->index
KVM: s390: move PGSTE softbits
KVM: s390: remove useless page->index usage
KVM: s390: move gmap_shadow_pgt_lookup() into kvm
KVM: s390: stop using lists to keep track of used dat tables
KVM: s390: stop using page->index for non-shadow gmaps
KVM: s390: move some gmap shadowing functions away from mm/gmap.c
KVM: s390: get rid of gmap_translate()
KVM: s390: get rid of gmap_fault()
...

+1093 -1007
+1 -1
Documentation/virt/kvm/api.rst
···
  S390:
  ^^^^^

- Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
+ Returns -EINVAL or -EEXIST if the VM has the KVM_VM_S390_UCONTROL flag set.
  Returns -EINVAL if called on a protected VM.

  4.36 KVM_SET_TSS_ADDR
+11 -38
arch/arm64/kvm/arch_timer.c
··· 471 471 472 472 trace_kvm_timer_emulate(ctx, should_fire); 473 473 474 - if (should_fire != ctx->irq.level) { 474 + if (should_fire != ctx->irq.level) 475 475 kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); 476 - return; 477 - } 478 476 479 477 kvm_timer_update_status(ctx, should_fire); 480 478 ··· 759 761 timer_irq(map->direct_ptimer), 760 762 &arch_timer_irq_ops); 761 763 WARN_ON_ONCE(ret); 762 - 763 - /* 764 - * The virtual offset behaviour is "interesting", as it 765 - * always applies when HCR_EL2.E2H==0, but only when 766 - * accessed from EL1 when HCR_EL2.E2H==1. So make sure we 767 - * track E2H when putting the HV timer in "direct" mode. 768 - */ 769 - if (map->direct_vtimer == vcpu_hvtimer(vcpu)) { 770 - struct arch_timer_offset *offs = &map->direct_vtimer->offset; 771 - 772 - if (vcpu_el2_e2h_is_set(vcpu)) 773 - offs->vcpu_offset = NULL; 774 - else 775 - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); 776 - } 777 764 } 778 765 } 779 766 ··· 959 976 * which allows trapping of the timer registers even with NV2. 960 977 * Still, this is still worse than FEAT_NV on its own. Meh. 961 978 */ 962 - if (!vcpu_el2_e2h_is_set(vcpu)) { 963 - if (cpus_have_final_cap(ARM64_HAS_ECV)) 964 - return; 965 - 966 - /* 967 - * A non-VHE guest hypervisor doesn't have any direct access 968 - * to its timers: the EL2 registers trap (and the HW is 969 - * fully emulated), while the EL0 registers access memory 970 - * despite the access being notionally direct. Boo. 971 - * 972 - * We update the hardware timer registers with the 973 - * latest value written by the guest to the VNCR page 974 - * and let the hardware take care of the rest. 975 - */ 976 - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL); 977 - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL); 978 - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL); 979 - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL); 980 - } else { 979 + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { 981 980 /* 982 981 * For a VHE guest hypervisor, the EL2 state is directly 983 - * stored in the host EL1 timers, while the emulated EL0 982 + * stored in the host EL1 timers, while the emulated EL1 984 983 * state is stored in the VNCR page. The latter could have 985 984 * been updated behind our back, and we must reset the 986 985 * emulation of the timers. 986 + * 987 + * A non-VHE guest hypervisor doesn't have any direct access 988 + * to its timers: the EL2 registers trap despite being 989 + * notionally direct (we use the EL1 HW, as for VHE), while 990 + * the EL1 registers access memory. 991 + * 992 + * In both cases, process the emulated timers on each guest 993 + * exit. Boo. 987 994 */ 988 995 struct timer_map map; 989 996 get_timer_map(vcpu, &map);
+20
arch/arm64/kvm/arm.c
···
  		break;
  	case -ENODEV:
  	case -ENXIO:
+ 		/*
+ 		 * No VGIC? No pKVM for you.
+ 		 *
+ 		 * Protected mode assumes that VGICv3 is present, so no point
+ 		 * in trying to hobble along if vgic initialization fails.
+ 		 */
+ 		if (is_protected_kvm_enabled())
+ 			goto out;
+
+ 		/*
+ 		 * Otherwise, userspace could choose to implement a GIC for its
+ 		 * guest on non-cooperative hardware.
+ 		 */
  		vgic_present = false;
  		err = 0;
  		break;
···
  	kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
  	kvm_nvhe_sym(__icache_flags) = __icache_flags;
  	kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
+
+ 	/*
+ 	 * Flush entire BSS since part of its data containing init symbols is read
+ 	 * while the MMU is off.
+ 	 */
+ 	kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
+ 				kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
  }

  static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
+24
arch/arm64/kvm/hyp/nvhe/hyp-main.c
··· 91 91 *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; 92 92 } 93 93 94 + static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) 95 + { 96 + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; 97 + 98 + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; 99 + 100 + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) 101 + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; 102 + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) 103 + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; 104 + } 105 + 106 + static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) 107 + { 108 + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; 109 + 110 + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) 111 + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; 112 + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) 113 + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; 114 + } 115 + 94 116 static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) 95 117 { 96 118 struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; 97 119 98 120 fpsimd_sve_flush(); 121 + flush_debug_state(hyp_vcpu); 99 122 100 123 hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; 101 124 ··· 146 123 unsigned int i; 147 124 148 125 fpsimd_sve_sync(&hyp_vcpu->vcpu); 126 + sync_debug_state(hyp_vcpu); 149 127 150 128 host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; 151 129
+5 -4
arch/arm64/kvm/nested.c
···
  	if (!tmp)
  		return -ENOMEM;

+ 	swap(kvm->arch.nested_mmus, tmp);
+
  	/*
  	 * If we went through a realocation, adjust the MMU back-pointers in
  	 * the previously initialised kvm_pgtable structures.
  	 */
  	if (kvm->arch.nested_mmus != tmp)
  		for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
- 			tmp[i].pgt->mmu = &tmp[i];
+ 			kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];

  	for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
- 		ret = init_nested_s2_mmu(kvm, &tmp[i]);
+ 		ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);

  	if (ret) {
  		for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
- 			kvm_free_stage2_pgd(&tmp[i]);
+ 			kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);

  		return ret;
  	}

  	kvm->arch.nested_mmus_size = num_mmus;
- 	kvm->arch.nested_mmus = tmp;

  	return 0;
  }
+13 -3
arch/arm64/kvm/sys_regs.c
···
  	return true;
  }

+ static bool access_hv_timer(struct kvm_vcpu *vcpu,
+ 			    struct sys_reg_params *p,
+ 			    const struct sys_reg_desc *r)
+ {
+ 	if (!vcpu_el2_e2h_is_set(vcpu))
+ 		return undef_access(vcpu, p, r);
+
+ 	return access_arch_timer(vcpu, p, r);
+ }
+
  static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
  				    s64 new, s64 cur)
  {
···
  	EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0),
  	EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0),

- 	{ SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer },
- 	EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0),
- 	EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0),
+ 	{ SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer },
+ 	EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0),
+ 	EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0),

  	{ SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },
+6 -14
arch/s390/include/asm/gmap.h
··· 23 23 /** 24 24 * struct gmap_struct - guest address space 25 25 * @list: list head for the mm->context gmap list 26 - * @crst_list: list of all crst tables used in the guest address space 27 26 * @mm: pointer to the parent mm_struct 28 27 * @guest_to_host: radix tree with guest to host address translation 29 28 * @host_to_guest: radix tree with pointer to segment table entries ··· 34 35 * @guest_handle: protected virtual machine handle for the ultravisor 35 36 * @host_to_rmap: radix tree with gmap_rmap lists 36 37 * @children: list of shadow gmap structures 37 - * @pt_list: list of all page tables used in the shadow guest address space 38 38 * @shadow_lock: spinlock to protect the shadow gmap list 39 39 * @parent: pointer to the parent gmap for shadow guest address spaces 40 40 * @orig_asce: ASCE for which the shadow page table has been created ··· 43 45 */ 44 46 struct gmap { 45 47 struct list_head list; 46 - struct list_head crst_list; 47 48 struct mm_struct *mm; 48 49 struct radix_tree_root guest_to_host; 49 50 struct radix_tree_root host_to_guest; ··· 58 61 /* Additional data for shadow guest address spaces */ 59 62 struct radix_tree_root host_to_rmap; 60 63 struct list_head children; 61 - struct list_head pt_list; 62 64 spinlock_t shadow_lock; 63 65 struct gmap *parent; 64 66 unsigned long orig_asce; ··· 102 106 void gmap_remove(struct gmap *gmap); 103 107 struct gmap *gmap_get(struct gmap *gmap); 104 108 void gmap_put(struct gmap *gmap); 109 + void gmap_free(struct gmap *gmap); 110 + struct gmap *gmap_alloc(unsigned long limit); 105 111 106 112 int gmap_map_segment(struct gmap *gmap, unsigned long from, 107 113 unsigned long to, unsigned long len); 108 114 int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); 109 115 unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); 110 - unsigned long gmap_translate(struct gmap *, unsigned long gaddr); 111 116 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); 112 - int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); 113 117 void gmap_discard(struct gmap *, unsigned long from, unsigned long to); 114 118 void __gmap_zap(struct gmap *, unsigned long gaddr); 115 119 void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); 116 120 117 121 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); 118 122 119 - struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, 120 - int edat_level); 121 - int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); 123 + void gmap_unshadow(struct gmap *sg); 122 124 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, 123 125 int fake); 124 126 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, ··· 125 131 int fake); 126 132 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, 127 133 int fake); 128 - int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, 129 - unsigned long *pgt, int *dat_protection, int *fake); 130 134 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); 131 135 132 136 void gmap_register_pte_notifier(struct gmap_notifier *); 133 137 void gmap_unregister_pte_notifier(struct gmap_notifier *); 134 138 135 - int gmap_mprotect_notify(struct gmap *, unsigned long start, 136 - unsigned long len, int prot); 139 + int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits); 137 140 138 141 void 
gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], 139 142 unsigned long gaddr, unsigned long vmaddr); 140 143 int s390_disable_cow_sharing(void); 141 - void s390_unlist_old_asce(struct gmap *gmap); 142 144 int s390_replace_asce(struct gmap *gmap); 143 145 void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); 144 146 int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, 145 147 unsigned long end, bool interruptible); 148 + int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split); 149 + unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); 146 150 147 151 /** 148 152 * s390_uv_destroy_range - Destroy a range of pages in the given mm.
+5 -1
arch/s390/include/asm/kvm_host.h
···
  #define KVM_S390_ESCA_CPU_SLOTS 248
  #define KVM_MAX_VCPUS 255

+ #define KVM_INTERNAL_MEM_SLOTS 1
+
  /*
   * These seem to be used for allocating ->chip in the routing table, which we
   * don't use. 1 is as small as we can get to reduce the needed memory. If we
···
  	u8 reserved928[0x1000 - 0x928];		/* 0x0928 */
  };

+ struct vsie_page;
+
  struct kvm_s390_vsie {
  	struct mutex mutex;
  	struct radix_tree_root addr_to_page;
  	int page_count;
  	int next;
- 	struct page *pages[KVM_MAX_VCPUS];
+ 	struct vsie_page *pages[KVM_MAX_VCPUS];
  };

  struct kvm_s390_gisa_iam {
+18 -3
arch/s390/include/asm/pgtable.h
··· 420 420 #define PGSTE_HC_BIT 0x0020000000000000UL 421 421 #define PGSTE_GR_BIT 0x0004000000000000UL 422 422 #define PGSTE_GC_BIT 0x0002000000000000UL 423 - #define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ 424 - #define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ 425 - #define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ 423 + #define PGSTE_ST2_MASK 0x0000ffff00000000UL 424 + #define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ 425 + #define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ 426 + #define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ 426 427 427 428 /* Guest Page State used for virtualization */ 428 429 #define _PGSTE_GPS_ZERO 0x0000000080000000UL ··· 2007 2006 2008 2007 #define pmd_pgtable(pmd) \ 2009 2008 ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) 2009 + 2010 + static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) 2011 + { 2012 + unsigned long *pgstes, res; 2013 + 2014 + pgstes = pgt + _PAGE_ENTRIES; 2015 + 2016 + res = (pgstes[0] & PGSTE_ST2_MASK) << 16; 2017 + res |= pgstes[1] & PGSTE_ST2_MASK; 2018 + res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16; 2019 + res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32; 2020 + 2021 + return res; 2022 + } 2010 2023 2011 2024 #endif /* _S390_PAGE_H */
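Note on the hunk above: the new gmap_pgste_get_pgt_addr() helper reassembles a 64-bit origin from the 16-bit ST2 fields (bits 47:32) of the four PGSTEs that follow a page table, most-significant quarter first. As a minimal sketch, here is the inverse packing implied by that accessor; the helper name is hypothetical and the real setter lives in the new KVM gmap code, which may differ in detail.

/*
 * Hypothetical inverse of gmap_pgste_get_pgt_addr(): scatter a 64-bit value
 * across the ST2 fields of the four PGSTEs following the page table.
 * Illustration only, not part of this series.
 */
static inline void gmap_pgste_set_pgt_addr_sketch(unsigned long *pgt, unsigned long val)
{
	unsigned long *pgstes = pgt + _PAGE_ENTRIES;

	pgstes[0] = (pgstes[0] & ~PGSTE_ST2_MASK) | ((val >> 16) & PGSTE_ST2_MASK); /* bits 63:48 */
	pgstes[1] = (pgstes[1] & ~PGSTE_ST2_MASK) | (val & PGSTE_ST2_MASK);         /* bits 47:32 */
	pgstes[2] = (pgstes[2] & ~PGSTE_ST2_MASK) | ((val << 16) & PGSTE_ST2_MASK); /* bits 31:16 */
	pgstes[3] = (pgstes[3] & ~PGSTE_ST2_MASK) | ((val << 32) & PGSTE_ST2_MASK); /* bits 15:0  */
}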
+3 -3
arch/s390/include/asm/uv.h
···
  }

  int uv_pin_shared(unsigned long paddr);
- int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
- int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
  int uv_destroy_folio(struct folio *folio);
  int uv_destroy_pte(pte_t pte);
  int uv_convert_from_secure_pte(pte_t pte);
- int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
+ int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
+ int uv_convert_from_secure(unsigned long paddr);
+ int uv_convert_from_secure_folio(struct folio *folio);

  void setup_uv(void);
+29 -263
arch/s390/kernel/uv.c
··· 19 19 #include <asm/sections.h> 20 20 #include <asm/uv.h> 21 21 22 - #if !IS_ENABLED(CONFIG_KVM) 23 - unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) 24 - { 25 - return 0; 26 - } 27 - 28 - int gmap_fault(struct gmap *gmap, unsigned long gaddr, 29 - unsigned int fault_flags) 30 - { 31 - return 0; 32 - } 33 - #endif 34 - 35 22 /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ 36 23 int __bootdata_preserved(prot_virt_guest); 37 24 EXPORT_SYMBOL(prot_virt_guest); ··· 146 159 folio_put(folio); 147 160 return rc; 148 161 } 162 + EXPORT_SYMBOL(uv_destroy_folio); 149 163 150 164 /* 151 165 * The present PTE still indirectly holds a folio reference through the mapping. ··· 163 175 * 164 176 * @paddr: Absolute host address of page to be exported 165 177 */ 166 - static int uv_convert_from_secure(unsigned long paddr) 178 + int uv_convert_from_secure(unsigned long paddr) 167 179 { 168 180 struct uv_cb_cfs uvcb = { 169 181 .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, ··· 175 187 return -EINVAL; 176 188 return 0; 177 189 } 190 + EXPORT_SYMBOL_GPL(uv_convert_from_secure); 178 191 179 192 /* 180 193 * The caller must already hold a reference to the folio. 181 194 */ 182 - static int uv_convert_from_secure_folio(struct folio *folio) 195 + int uv_convert_from_secure_folio(struct folio *folio) 183 196 { 184 197 int rc; 185 198 ··· 195 206 folio_put(folio); 196 207 return rc; 197 208 } 209 + EXPORT_SYMBOL_GPL(uv_convert_from_secure_folio); 198 210 199 211 /* 200 212 * The present PTE still indirectly holds a folio reference through the mapping. ··· 227 237 return res; 228 238 } 229 239 230 - static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) 240 + /** 241 + * make_folio_secure() - make a folio secure 242 + * @folio: the folio to make secure 243 + * @uvcb: the uvcb that describes the UVC to be used 244 + * 245 + * The folio @folio will be made secure if possible, @uvcb will be passed 246 + * as-is to the UVC. 247 + * 248 + * Return: 0 on success; 249 + * -EBUSY if the folio is in writeback or has too many references; 250 + * -E2BIG if the folio is large; 251 + * -EAGAIN if the UVC needs to be attempted again; 252 + * -ENXIO if the address is not mapped; 253 + * -EINVAL if the UVC failed for other reasons. 254 + * 255 + * Context: The caller must hold exactly one extra reference on the folio 256 + * (it's the same logic as split_folio()) 257 + */ 258 + int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) 231 259 { 232 260 int expected, cc = 0; 233 261 262 + if (folio_test_large(folio)) 263 + return -E2BIG; 234 264 if (folio_test_writeback(folio)) 235 - return -EAGAIN; 236 - expected = expected_folio_refs(folio); 265 + return -EBUSY; 266 + expected = expected_folio_refs(folio) + 1; 237 267 if (!folio_ref_freeze(folio, expected)) 238 268 return -EBUSY; 239 269 set_bit(PG_arch_1, &folio->flags); ··· 277 267 return -EAGAIN; 278 268 return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; 279 269 } 280 - 281 - /** 282 - * should_export_before_import - Determine whether an export is needed 283 - * before an import-like operation 284 - * @uvcb: the Ultravisor control block of the UVC to be performed 285 - * @mm: the mm of the process 286 - * 287 - * Returns whether an export is needed before every import-like operation. 288 - * This is needed for shared pages, which don't trigger a secure storage 289 - * exception when accessed from a different guest. 
290 - * 291 - * Although considered as one, the Unpin Page UVC is not an actual import, 292 - * so it is not affected. 293 - * 294 - * No export is needed also when there is only one protected VM, because the 295 - * page cannot belong to the wrong VM in that case (there is no "other VM" 296 - * it can belong to). 297 - * 298 - * Return: true if an export is needed before every import, otherwise false. 299 - */ 300 - static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) 301 - { 302 - /* 303 - * The misc feature indicates, among other things, that importing a 304 - * shared page from a different protected VM will automatically also 305 - * transfer its ownership. 306 - */ 307 - if (uv_has_feature(BIT_UV_FEAT_MISC)) 308 - return false; 309 - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) 310 - return false; 311 - return atomic_read(&mm->context.protected_count) > 1; 312 - } 313 - 314 - /* 315 - * Drain LRU caches: the local one on first invocation and the ones of all 316 - * CPUs on successive invocations. Returns "true" on the first invocation. 317 - */ 318 - static bool drain_lru(bool *drain_lru_called) 319 - { 320 - /* 321 - * If we have tried a local drain and the folio refcount 322 - * still does not match our expected safe value, try with a 323 - * system wide drain. This is needed if the pagevecs holding 324 - * the page are on a different CPU. 325 - */ 326 - if (*drain_lru_called) { 327 - lru_add_drain_all(); 328 - /* We give up here, don't retry immediately. */ 329 - return false; 330 - } 331 - /* 332 - * We are here if the folio refcount does not match the 333 - * expected safe value. The main culprits are usually 334 - * pagevecs. With lru_add_drain() we drain the pagevecs 335 - * on the local CPU so that hopefully the refcount will 336 - * reach the expected safe value. 337 - */ 338 - lru_add_drain(); 339 - *drain_lru_called = true; 340 - /* The caller should try again immediately */ 341 - return true; 342 - } 343 - 344 - /* 345 - * Requests the Ultravisor to make a page accessible to a guest. 346 - * If it's brought in the first time, it will be cleared. If 347 - * it has been exported before, it will be decrypted and integrity 348 - * checked. 349 - */ 350 - int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) 351 - { 352 - struct vm_area_struct *vma; 353 - bool drain_lru_called = false; 354 - spinlock_t *ptelock; 355 - unsigned long uaddr; 356 - struct folio *folio; 357 - pte_t *ptep; 358 - int rc; 359 - 360 - again: 361 - rc = -EFAULT; 362 - mmap_read_lock(gmap->mm); 363 - 364 - uaddr = __gmap_translate(gmap, gaddr); 365 - if (IS_ERR_VALUE(uaddr)) 366 - goto out; 367 - vma = vma_lookup(gmap->mm, uaddr); 368 - if (!vma) 369 - goto out; 370 - /* 371 - * Secure pages cannot be huge and userspace should not combine both. 372 - * In case userspace does it anyway this will result in an -EFAULT for 373 - * the unpack. The guest is thus never reaching secure mode. If 374 - * userspace is playing dirty tricky with mapping huge pages later 375 - * on this will result in a segmentation fault. 
376 - */ 377 - if (is_vm_hugetlb_page(vma)) 378 - goto out; 379 - 380 - rc = -ENXIO; 381 - ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); 382 - if (!ptep) 383 - goto out; 384 - if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) { 385 - folio = page_folio(pte_page(*ptep)); 386 - rc = -EAGAIN; 387 - if (folio_test_large(folio)) { 388 - rc = -E2BIG; 389 - } else if (folio_trylock(folio)) { 390 - if (should_export_before_import(uvcb, gmap->mm)) 391 - uv_convert_from_secure(PFN_PHYS(folio_pfn(folio))); 392 - rc = make_folio_secure(folio, uvcb); 393 - folio_unlock(folio); 394 - } 395 - 396 - /* 397 - * Once we drop the PTL, the folio may get unmapped and 398 - * freed immediately. We need a temporary reference. 399 - */ 400 - if (rc == -EAGAIN || rc == -E2BIG) 401 - folio_get(folio); 402 - } 403 - pte_unmap_unlock(ptep, ptelock); 404 - out: 405 - mmap_read_unlock(gmap->mm); 406 - 407 - switch (rc) { 408 - case -E2BIG: 409 - folio_lock(folio); 410 - rc = split_folio(folio); 411 - folio_unlock(folio); 412 - folio_put(folio); 413 - 414 - switch (rc) { 415 - case 0: 416 - /* Splitting succeeded, try again immediately. */ 417 - goto again; 418 - case -EAGAIN: 419 - /* Additional folio references. */ 420 - if (drain_lru(&drain_lru_called)) 421 - goto again; 422 - return -EAGAIN; 423 - case -EBUSY: 424 - /* Unexpected race. */ 425 - return -EAGAIN; 426 - } 427 - WARN_ON_ONCE(1); 428 - return -ENXIO; 429 - case -EAGAIN: 430 - /* 431 - * If we are here because the UVC returned busy or partial 432 - * completion, this is just a useless check, but it is safe. 433 - */ 434 - folio_wait_writeback(folio); 435 - folio_put(folio); 436 - return -EAGAIN; 437 - case -EBUSY: 438 - /* Additional folio references. */ 439 - if (drain_lru(&drain_lru_called)) 440 - goto again; 441 - return -EAGAIN; 442 - case -ENXIO: 443 - if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) 444 - return -EFAULT; 445 - return -EAGAIN; 446 - } 447 - return rc; 448 - } 449 - EXPORT_SYMBOL_GPL(gmap_make_secure); 450 - 451 - int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) 452 - { 453 - struct uv_cb_cts uvcb = { 454 - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, 455 - .header.len = sizeof(uvcb), 456 - .guest_handle = gmap->guest_handle, 457 - .gaddr = gaddr, 458 - }; 459 - 460 - return gmap_make_secure(gmap, gaddr, &uvcb); 461 - } 462 - EXPORT_SYMBOL_GPL(gmap_convert_to_secure); 463 - 464 - /** 465 - * gmap_destroy_page - Destroy a guest page. 466 - * @gmap: the gmap of the guest 467 - * @gaddr: the guest address to destroy 468 - * 469 - * An attempt will be made to destroy the given guest page. If the attempt 470 - * fails, an attempt is made to export the page. If both attempts fail, an 471 - * appropriate error is returned. 472 - */ 473 - int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) 474 - { 475 - struct vm_area_struct *vma; 476 - struct folio_walk fw; 477 - unsigned long uaddr; 478 - struct folio *folio; 479 - int rc; 480 - 481 - rc = -EFAULT; 482 - mmap_read_lock(gmap->mm); 483 - 484 - uaddr = __gmap_translate(gmap, gaddr); 485 - if (IS_ERR_VALUE(uaddr)) 486 - goto out; 487 - vma = vma_lookup(gmap->mm, uaddr); 488 - if (!vma) 489 - goto out; 490 - /* 491 - * Huge pages should not be able to become secure 492 - */ 493 - if (is_vm_hugetlb_page(vma)) 494 - goto out; 495 - 496 - rc = 0; 497 - folio = folio_walk_start(&fw, vma, uaddr, 0); 498 - if (!folio) 499 - goto out; 500 - /* 501 - * See gmap_make_secure(): large folios cannot be secure. Small 502 - * folio implies FW_LEVEL_PTE. 
503 - */ 504 - if (folio_test_large(folio) || !pte_write(fw.pte)) 505 - goto out_walk_end; 506 - rc = uv_destroy_folio(folio); 507 - /* 508 - * Fault handlers can race; it is possible that two CPUs will fault 509 - * on the same secure page. One CPU can destroy the page, reboot, 510 - * re-enter secure mode and import it, while the second CPU was 511 - * stuck at the beginning of the handler. At some point the second 512 - * CPU will be able to progress, and it will not be able to destroy 513 - * the page. In that case we do not want to terminate the process, 514 - * we instead try to export the page. 515 - */ 516 - if (rc) 517 - rc = uv_convert_from_secure_folio(folio); 518 - out_walk_end: 519 - folio_walk_end(&fw, vma); 520 - out: 521 - mmap_read_unlock(gmap->mm); 522 - return rc; 523 - } 524 - EXPORT_SYMBOL_GPL(gmap_destroy_page); 270 + EXPORT_SYMBOL_GPL(make_folio_secure); 525 271 526 272 /* 527 273 * To be called with the folio locked or with an extra reference! This will
+1 -1
arch/s390/kvm/Makefile
···
  ccflags-y := -Ivirt/kvm -Iarch/s390/kvm

  kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
- kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+ kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o

  kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
  obj-$(CONFIG_KVM) += kvm.o
+43 -1
arch/s390/kvm/gaccess.c
··· 16 16 #include <asm/gmap.h> 17 17 #include <asm/dat-bits.h> 18 18 #include "kvm-s390.h" 19 + #include "gmap.h" 19 20 #include "gaccess.h" 20 21 21 22 /* ··· 1394 1393 } 1395 1394 1396 1395 /** 1396 + * shadow_pgt_lookup() - find a shadow page table 1397 + * @sg: pointer to the shadow guest address space structure 1398 + * @saddr: the address in the shadow aguest address space 1399 + * @pgt: parent gmap address of the page table to get shadowed 1400 + * @dat_protection: if the pgtable is marked as protected by dat 1401 + * @fake: pgt references contiguous guest memory block, not a pgtable 1402 + * 1403 + * Returns 0 if the shadow page table was found and -EAGAIN if the page 1404 + * table was not found. 1405 + * 1406 + * Called with sg->mm->mmap_lock in read. 1407 + */ 1408 + static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, 1409 + int *dat_protection, int *fake) 1410 + { 1411 + unsigned long pt_index; 1412 + unsigned long *table; 1413 + struct page *page; 1414 + int rc; 1415 + 1416 + spin_lock(&sg->guest_table_lock); 1417 + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ 1418 + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { 1419 + /* Shadow page tables are full pages (pte+pgste) */ 1420 + page = pfn_to_page(*table >> PAGE_SHIFT); 1421 + pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); 1422 + *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; 1423 + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); 1424 + *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); 1425 + rc = 0; 1426 + } else { 1427 + rc = -EAGAIN; 1428 + } 1429 + spin_unlock(&sg->guest_table_lock); 1430 + return rc; 1431 + } 1432 + 1433 + /** 1397 1434 * kvm_s390_shadow_fault - handle fault on a shadow page table 1398 1435 * @vcpu: virtual cpu 1399 1436 * @sg: pointer to the shadow guest address space structure ··· 1454 1415 int dat_protection, fake; 1455 1416 int rc; 1456 1417 1418 + if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) 1419 + return -EFAULT; 1420 + 1457 1421 mmap_read_lock(sg->mm); 1458 1422 /* 1459 1423 * We don't want any guest-2 tables to change - so the parent ··· 1465 1423 */ 1466 1424 ipte_lock(vcpu->kvm); 1467 1425 1468 - rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); 1426 + rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); 1469 1427 if (rc) 1470 1428 rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, 1471 1429 &fake);
+142
arch/s390/kvm/gmap-vsie.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Guest memory management for KVM/s390 nested VMs. 4 + * 5 + * Copyright IBM Corp. 2008, 2020, 2024 6 + * 7 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> 8 + * Martin Schwidefsky <schwidefsky@de.ibm.com> 9 + * David Hildenbrand <david@redhat.com> 10 + * Janosch Frank <frankja@linux.vnet.ibm.com> 11 + */ 12 + 13 + #include <linux/compiler.h> 14 + #include <linux/kvm.h> 15 + #include <linux/kvm_host.h> 16 + #include <linux/pgtable.h> 17 + #include <linux/pagemap.h> 18 + #include <linux/mman.h> 19 + 20 + #include <asm/lowcore.h> 21 + #include <asm/gmap.h> 22 + #include <asm/uv.h> 23 + 24 + #include "kvm-s390.h" 25 + #include "gmap.h" 26 + 27 + /** 28 + * gmap_find_shadow - find a specific asce in the list of shadow tables 29 + * @parent: pointer to the parent gmap 30 + * @asce: ASCE for which the shadow table is created 31 + * @edat_level: edat level to be used for the shadow translation 32 + * 33 + * Returns the pointer to a gmap if a shadow table with the given asce is 34 + * already available, ERR_PTR(-EAGAIN) if another one is just being created, 35 + * otherwise NULL 36 + * 37 + * Context: Called with parent->shadow_lock held 38 + */ 39 + static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) 40 + { 41 + struct gmap *sg; 42 + 43 + lockdep_assert_held(&parent->shadow_lock); 44 + list_for_each_entry(sg, &parent->children, list) { 45 + if (!gmap_shadow_valid(sg, asce, edat_level)) 46 + continue; 47 + if (!sg->initialized) 48 + return ERR_PTR(-EAGAIN); 49 + refcount_inc(&sg->ref_count); 50 + return sg; 51 + } 52 + return NULL; 53 + } 54 + 55 + /** 56 + * gmap_shadow - create/find a shadow guest address space 57 + * @parent: pointer to the parent gmap 58 + * @asce: ASCE for which the shadow table is created 59 + * @edat_level: edat level to be used for the shadow translation 60 + * 61 + * The pages of the top level page table referred by the asce parameter 62 + * will be set to read-only and marked in the PGSTEs of the kvm process. 63 + * The shadow table will be removed automatically on any change to the 64 + * PTE mapping for the source table. 65 + * 66 + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 67 + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 68 + * parent gmap table could not be protected. 
69 + */ 70 + struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) 71 + { 72 + struct gmap *sg, *new; 73 + unsigned long limit; 74 + int rc; 75 + 76 + if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || 77 + KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) 78 + return ERR_PTR(-EFAULT); 79 + spin_lock(&parent->shadow_lock); 80 + sg = gmap_find_shadow(parent, asce, edat_level); 81 + spin_unlock(&parent->shadow_lock); 82 + if (sg) 83 + return sg; 84 + /* Create a new shadow gmap */ 85 + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); 86 + if (asce & _ASCE_REAL_SPACE) 87 + limit = -1UL; 88 + new = gmap_alloc(limit); 89 + if (!new) 90 + return ERR_PTR(-ENOMEM); 91 + new->mm = parent->mm; 92 + new->parent = gmap_get(parent); 93 + new->private = parent->private; 94 + new->orig_asce = asce; 95 + new->edat_level = edat_level; 96 + new->initialized = false; 97 + spin_lock(&parent->shadow_lock); 98 + /* Recheck if another CPU created the same shadow */ 99 + sg = gmap_find_shadow(parent, asce, edat_level); 100 + if (sg) { 101 + spin_unlock(&parent->shadow_lock); 102 + gmap_free(new); 103 + return sg; 104 + } 105 + if (asce & _ASCE_REAL_SPACE) { 106 + /* only allow one real-space gmap shadow */ 107 + list_for_each_entry(sg, &parent->children, list) { 108 + if (sg->orig_asce & _ASCE_REAL_SPACE) { 109 + spin_lock(&sg->guest_table_lock); 110 + gmap_unshadow(sg); 111 + spin_unlock(&sg->guest_table_lock); 112 + list_del(&sg->list); 113 + gmap_put(sg); 114 + break; 115 + } 116 + } 117 + } 118 + refcount_set(&new->ref_count, 2); 119 + list_add(&new->list, &parent->children); 120 + if (asce & _ASCE_REAL_SPACE) { 121 + /* nothing to protect, return right away */ 122 + new->initialized = true; 123 + spin_unlock(&parent->shadow_lock); 124 + return new; 125 + } 126 + spin_unlock(&parent->shadow_lock); 127 + /* protect after insertion, so it will get properly invalidated */ 128 + mmap_read_lock(parent->mm); 129 + rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, 130 + ((asce & _ASCE_TABLE_LENGTH) + 1), 131 + PROT_READ, GMAP_NOTIFY_SHADOW); 132 + mmap_read_unlock(parent->mm); 133 + spin_lock(&parent->shadow_lock); 134 + new->initialized = true; 135 + if (rc) { 136 + list_del(&new->list); 137 + gmap_free(new); 138 + new = ERR_PTR(rc); 139 + } 140 + spin_unlock(&parent->shadow_lock); 141 + return new; 142 + }
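A short usage sketch of the relocated gmap_shadow() as a caller (for example the vsie code) would use it when acquiring a shadow address space for a nested guest. How asce and edat_level are derived from guest-2 state is elided; the caller shape is an assumption, only the gmap_shadow() contract and the vsie_page->gmap field come from this series.

	/* Sketch: obtain (or reuse) a shadow gmap for the nested guest's ASCE. */
	struct gmap *sg;

	sg = gmap_shadow(vcpu->kvm->arch.gmap, asce, edat_level);
	if (IS_ERR(sg))
		return PTR_ERR(sg);	/* -ENOMEM, -EAGAIN or -EFAULT, as documented above */
	vsie_page->gmap = sg;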
+212
arch/s390/kvm/gmap.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Guest memory management for KVM/s390 4 + * 5 + * Copyright IBM Corp. 2008, 2020, 2024 6 + * 7 + * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> 8 + * Martin Schwidefsky <schwidefsky@de.ibm.com> 9 + * David Hildenbrand <david@redhat.com> 10 + * Janosch Frank <frankja@linux.vnet.ibm.com> 11 + */ 12 + 13 + #include <linux/compiler.h> 14 + #include <linux/kvm.h> 15 + #include <linux/kvm_host.h> 16 + #include <linux/pgtable.h> 17 + #include <linux/pagemap.h> 18 + 19 + #include <asm/lowcore.h> 20 + #include <asm/gmap.h> 21 + #include <asm/uv.h> 22 + 23 + #include "gmap.h" 24 + 25 + /** 26 + * should_export_before_import - Determine whether an export is needed 27 + * before an import-like operation 28 + * @uvcb: the Ultravisor control block of the UVC to be performed 29 + * @mm: the mm of the process 30 + * 31 + * Returns whether an export is needed before every import-like operation. 32 + * This is needed for shared pages, which don't trigger a secure storage 33 + * exception when accessed from a different guest. 34 + * 35 + * Although considered as one, the Unpin Page UVC is not an actual import, 36 + * so it is not affected. 37 + * 38 + * No export is needed also when there is only one protected VM, because the 39 + * page cannot belong to the wrong VM in that case (there is no "other VM" 40 + * it can belong to). 41 + * 42 + * Return: true if an export is needed before every import, otherwise false. 43 + */ 44 + static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) 45 + { 46 + /* 47 + * The misc feature indicates, among other things, that importing a 48 + * shared page from a different protected VM will automatically also 49 + * transfer its ownership. 50 + */ 51 + if (uv_has_feature(BIT_UV_FEAT_MISC)) 52 + return false; 53 + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) 54 + return false; 55 + return atomic_read(&mm->context.protected_count) > 1; 56 + } 57 + 58 + static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb) 59 + { 60 + struct folio *folio = page_folio(page); 61 + int rc; 62 + 63 + /* 64 + * Secure pages cannot be huge and userspace should not combine both. 65 + * In case userspace does it anyway this will result in an -EFAULT for 66 + * the unpack. The guest is thus never reaching secure mode. 67 + * If userspace plays dirty tricks and decides to map huge pages at a 68 + * later point in time, it will receive a segmentation fault or 69 + * KVM_RUN will return -EFAULT. 70 + */ 71 + if (folio_test_hugetlb(folio)) 72 + return -EFAULT; 73 + if (folio_test_large(folio)) { 74 + mmap_read_unlock(gmap->mm); 75 + rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true); 76 + mmap_read_lock(gmap->mm); 77 + if (rc) 78 + return rc; 79 + folio = page_folio(page); 80 + } 81 + 82 + if (!folio_trylock(folio)) 83 + return -EAGAIN; 84 + if (should_export_before_import(uvcb, gmap->mm)) 85 + uv_convert_from_secure(folio_to_phys(folio)); 86 + rc = make_folio_secure(folio, uvcb); 87 + folio_unlock(folio); 88 + 89 + /* 90 + * In theory a race is possible and the folio might have become 91 + * large again before the folio_trylock() above. In that case, no 92 + * action is performed and -EAGAIN is returned; the callers will 93 + * have to try again later. 94 + * In most cases this implies running the VM again, getting the same 95 + * exception again, and make another attempt in this function. 96 + * This is expected to happen extremely rarely. 
97 + */ 98 + if (rc == -E2BIG) 99 + return -EAGAIN; 100 + /* The folio has too many references, try to shake some off */ 101 + if (rc == -EBUSY) { 102 + mmap_read_unlock(gmap->mm); 103 + kvm_s390_wiggle_split_folio(gmap->mm, folio, false); 104 + mmap_read_lock(gmap->mm); 105 + return -EAGAIN; 106 + } 107 + 108 + return rc; 109 + } 110 + 111 + /** 112 + * gmap_make_secure() - make one guest page secure 113 + * @gmap: the guest gmap 114 + * @gaddr: the guest address that needs to be made secure 115 + * @uvcb: the UVCB specifying which operation needs to be performed 116 + * 117 + * Context: needs to be called with kvm->srcu held. 118 + * Return: 0 on success, < 0 in case of error (see __gmap_make_secure()). 119 + */ 120 + int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) 121 + { 122 + struct kvm *kvm = gmap->private; 123 + struct page *page; 124 + int rc = 0; 125 + 126 + lockdep_assert_held(&kvm->srcu); 127 + 128 + page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); 129 + mmap_read_lock(gmap->mm); 130 + if (page) 131 + rc = __gmap_make_secure(gmap, page, uvcb); 132 + kvm_release_page_clean(page); 133 + mmap_read_unlock(gmap->mm); 134 + 135 + return rc; 136 + } 137 + 138 + int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) 139 + { 140 + struct uv_cb_cts uvcb = { 141 + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, 142 + .header.len = sizeof(uvcb), 143 + .guest_handle = gmap->guest_handle, 144 + .gaddr = gaddr, 145 + }; 146 + 147 + return gmap_make_secure(gmap, gaddr, &uvcb); 148 + } 149 + 150 + /** 151 + * __gmap_destroy_page() - Destroy a guest page. 152 + * @gmap: the gmap of the guest 153 + * @page: the page to destroy 154 + * 155 + * An attempt will be made to destroy the given guest page. If the attempt 156 + * fails, an attempt is made to export the page. If both attempts fail, an 157 + * appropriate error is returned. 158 + * 159 + * Context: must be called holding the mm lock for gmap->mm 160 + */ 161 + static int __gmap_destroy_page(struct gmap *gmap, struct page *page) 162 + { 163 + struct folio *folio = page_folio(page); 164 + int rc; 165 + 166 + /* 167 + * See gmap_make_secure(): large folios cannot be secure. Small 168 + * folio implies FW_LEVEL_PTE. 169 + */ 170 + if (folio_test_large(folio)) 171 + return -EFAULT; 172 + 173 + rc = uv_destroy_folio(folio); 174 + /* 175 + * Fault handlers can race; it is possible that two CPUs will fault 176 + * on the same secure page. One CPU can destroy the page, reboot, 177 + * re-enter secure mode and import it, while the second CPU was 178 + * stuck at the beginning of the handler. At some point the second 179 + * CPU will be able to progress, and it will not be able to destroy 180 + * the page. In that case we do not want to terminate the process, 181 + * we instead try to export the page. 182 + */ 183 + if (rc) 184 + rc = uv_convert_from_secure_folio(folio); 185 + 186 + return rc; 187 + } 188 + 189 + /** 190 + * gmap_destroy_page() - Destroy a guest page. 191 + * @gmap: the gmap of the guest 192 + * @gaddr: the guest address to destroy 193 + * 194 + * An attempt will be made to destroy the given guest page. If the attempt 195 + * fails, an attempt is made to export the page. If both attempts fail, an 196 + * appropriate error is returned. 197 + * 198 + * Context: may sleep. 
199 + */ 200 + int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) 201 + { 202 + struct page *page; 203 + int rc = 0; 204 + 205 + mmap_read_lock(gmap->mm); 206 + page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr)); 207 + if (page) 208 + rc = __gmap_destroy_page(gmap, page); 209 + kvm_release_page_clean(page); 210 + mmap_read_unlock(gmap->mm); 211 + return rc; 212 + }
+39
arch/s390/kvm/gmap.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * KVM guest address space mapping code 4 + * 5 + * Copyright IBM Corp. 2007, 2016, 2025 6 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> 7 + * Claudio Imbrenda <imbrenda@linux.ibm.com> 8 + */ 9 + 10 + #ifndef ARCH_KVM_S390_GMAP_H 11 + #define ARCH_KVM_S390_GMAP_H 12 + 13 + #define GMAP_SHADOW_FAKE_TABLE 1ULL 14 + 15 + int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); 16 + int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); 17 + int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); 18 + struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); 19 + 20 + /** 21 + * gmap_shadow_valid - check if a shadow guest address space matches the 22 + * given properties and is still valid 23 + * @sg: pointer to the shadow guest address space structure 24 + * @asce: ASCE for which the shadow table is requested 25 + * @edat_level: edat level to be used for the shadow translation 26 + * 27 + * Returns 1 if the gmap shadow is still valid and matches the given 28 + * properties, the caller can continue using it. Returns 0 otherwise, the 29 + * caller has to request a new shadow gmap in this case. 30 + * 31 + */ 32 + static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) 33 + { 34 + if (sg->removed) 35 + return 0; 36 + return sg->orig_asce == asce && sg->edat_level == edat_level; 37 + } 38 + 39 + #endif
+4 -3
arch/s390/kvm/intercept.c
···
  #include "gaccess.h"
  #include "trace.h"
  #include "trace-s390.h"
+ #include "gmap.h"

  u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
  {
···
  				reg2, &srcaddr, GACC_FETCH, 0);
  	if (rc)
  		return kvm_s390_inject_prog_cond(vcpu, rc);
- 	rc = gmap_fault(vcpu->arch.gmap, srcaddr, 0);
+ 	rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0);
  	if (rc != 0)
  		return rc;

···
  				reg1, &dstaddr, GACC_STORE, 0);
  	if (rc)
  		return kvm_s390_inject_prog_cond(vcpu, rc);
- 	rc = gmap_fault(vcpu->arch.gmap, dstaddr, FAULT_FLAG_WRITE);
+ 	rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE);
  	if (rc != 0)
  		return rc;
···
  	 * If the unpin did not succeed, the guest will exit again for the UVC
  	 * and we will retry the unpin.
  	 */
- 	if (rc == -EINVAL)
+ 	if (rc == -EINVAL || rc == -ENXIO)
  		return 0;
  	/*
  	 * If we got -EAGAIN here, we simply return it. It will eventually
+11 -8
arch/s390/kvm/interrupt.c
··· 2893 2893 struct kvm_kernel_irq_routing_entry *e, 2894 2894 const struct kvm_irq_routing_entry *ue) 2895 2895 { 2896 - u64 uaddr; 2896 + u64 uaddr_s, uaddr_i; 2897 + int idx; 2897 2898 2898 2899 switch (ue->type) { 2899 2900 /* we store the userspace addresses instead of the guest addresses */ ··· 2902 2901 if (kvm_is_ucontrol(kvm)) 2903 2902 return -EINVAL; 2904 2903 e->set = set_adapter_int; 2905 - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); 2906 - if (uaddr == -EFAULT) 2904 + 2905 + idx = srcu_read_lock(&kvm->srcu); 2906 + uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr); 2907 + uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr); 2908 + srcu_read_unlock(&kvm->srcu, idx); 2909 + 2910 + if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) 2907 2911 return -EFAULT; 2908 - e->adapter.summary_addr = uaddr; 2909 - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); 2910 - if (uaddr == -EFAULT) 2911 - return -EFAULT; 2912 - e->adapter.ind_addr = uaddr; 2912 + e->adapter.summary_addr = uaddr_s; 2913 + e->adapter.ind_addr = uaddr_i; 2913 2914 e->adapter.summary_offset = ue->u.adapter.summary_offset; 2914 2915 e->adapter.ind_offset = ue->u.adapter.ind_offset; 2915 2916 e->adapter.adapter_id = ue->u.adapter.adapter_id;
+197 -40
arch/s390/kvm/kvm-s390.c
··· 50 50 #include "kvm-s390.h" 51 51 #include "gaccess.h" 52 52 #include "pci.h" 53 + #include "gmap.h" 53 54 54 55 #define CREATE_TRACE_POINTS 55 56 #include "trace.h" ··· 3429 3428 VM_EVENT(kvm, 3, "vm created with type %lu", type); 3430 3429 3431 3430 if (type & KVM_VM_S390_UCONTROL) { 3431 + struct kvm_userspace_memory_region2 fake_memslot = { 3432 + .slot = KVM_S390_UCONTROL_MEMSLOT, 3433 + .guest_phys_addr = 0, 3434 + .userspace_addr = 0, 3435 + .memory_size = ALIGN_DOWN(TASK_SIZE, _SEGMENT_SIZE), 3436 + .flags = 0, 3437 + }; 3438 + 3432 3439 kvm->arch.gmap = NULL; 3433 3440 kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; 3441 + /* one flat fake memslot covering the whole address-space */ 3442 + mutex_lock(&kvm->slots_lock); 3443 + KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); 3444 + mutex_unlock(&kvm->slots_lock); 3434 3445 } else { 3435 3446 if (sclp.hamax == U64_MAX) 3436 3447 kvm->arch.mem_limit = TASK_SIZE_MAX; ··· 4511 4498 return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); 4512 4499 } 4513 4500 4501 + static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) 4502 + { 4503 + struct kvm *kvm = gmap->private; 4504 + gfn_t gfn = gpa_to_gfn(gaddr); 4505 + bool unlocked; 4506 + hva_t vmaddr; 4507 + gpa_t tmp; 4508 + int rc; 4509 + 4510 + if (kvm_is_ucontrol(kvm)) { 4511 + tmp = __gmap_translate(gmap, gaddr); 4512 + gfn = gpa_to_gfn(tmp); 4513 + } 4514 + 4515 + vmaddr = gfn_to_hva(kvm, gfn); 4516 + rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); 4517 + if (!rc) 4518 + rc = __gmap_link(gmap, gaddr, vmaddr); 4519 + return rc; 4520 + } 4521 + 4522 + /** 4523 + * __kvm_s390_mprotect_many() - Apply specified protection to guest pages 4524 + * @gmap: the gmap of the guest 4525 + * @gpa: the starting guest address 4526 + * @npages: how many pages to protect 4527 + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 4528 + * @bits: pgste notification bits to set 4529 + * 4530 + * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() 4531 + * 4532 + * Context: kvm->srcu and gmap->mm need to be held in read mode 4533 + */ 4534 + int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, 4535 + unsigned long bits) 4536 + { 4537 + unsigned int fault_flag = (prot & PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; 4538 + gpa_t end = gpa + npages * PAGE_SIZE; 4539 + int rc; 4540 + 4541 + for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { 4542 + rc = gmap_protect_one(gmap, gpa, prot, bits); 4543 + if (rc == -EAGAIN) { 4544 + __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); 4545 + rc = gmap_protect_one(gmap, gpa, prot, bits); 4546 + } 4547 + if (rc < 0) 4548 + return rc; 4549 + } 4550 + 4551 + return 0; 4552 + } 4553 + 4554 + static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) 4555 + { 4556 + gpa_t gaddr = kvm_s390_get_prefix(vcpu); 4557 + int idx, rc; 4558 + 4559 + idx = srcu_read_lock(&vcpu->kvm->srcu); 4560 + mmap_read_lock(vcpu->arch.gmap->mm); 4561 + 4562 + rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); 4563 + 4564 + mmap_read_unlock(vcpu->arch.gmap->mm); 4565 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 4566 + 4567 + return rc; 4568 + } 4569 + 4514 4570 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) 4515 4571 { 4516 4572 retry: ··· 4595 4513 */ 4596 4514 if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { 4597 4515 int rc; 4598 - rc = gmap_mprotect_notify(vcpu->arch.gmap, 4599 - kvm_s390_get_prefix(vcpu), 4600 - PAGE_SIZE * 2, PROT_WRITE); 4516 + 4517 + rc = kvm_s390_mprotect_notify_prefix(vcpu); 4601 4518 if (rc) { 4602 4519 kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); 4603 4520 return rc; ··· 4847 4766 return kvm_s390_inject_prog_irq(vcpu, &pgm_info); 4848 4767 } 4849 4768 4769 + static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) 4770 + { 4771 + KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, 4772 + "Unexpected program interrupt 0x%x, TEID 0x%016lx", 4773 + current->thread.gmap_int_code, current->thread.gmap_teid.val); 4774 + } 4775 + 4776 + /* 4777 + * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu 4778 + * @vcpu: the vCPU whose gmap is to be fixed up 4779 + * @gfn: the guest frame number used for memslots (including fake memslots) 4780 + * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps 4781 + * @flags: FOLL_* flags 4782 + * 4783 + * Return: 0 on success, < 0 in case of error. 4784 + * Context: The mm lock must not be held before calling. May sleep. 4785 + */ 4786 + int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) 4787 + { 4788 + struct kvm_memory_slot *slot; 4789 + unsigned int fault_flags; 4790 + bool writable, unlocked; 4791 + unsigned long vmaddr; 4792 + struct page *page; 4793 + kvm_pfn_t pfn; 4794 + int rc; 4795 + 4796 + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 4797 + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 4798 + return vcpu_post_run_addressing_exception(vcpu); 4799 + 4800 + fault_flags = flags & FOLL_WRITE ? 
FAULT_FLAG_WRITE : 0; 4801 + if (vcpu->arch.gmap->pfault_enabled) 4802 + flags |= FOLL_NOWAIT; 4803 + vmaddr = __gfn_to_hva_memslot(slot, gfn); 4804 + 4805 + try_again: 4806 + pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); 4807 + 4808 + /* Access outside memory, inject addressing exception */ 4809 + if (is_noslot_pfn(pfn)) 4810 + return vcpu_post_run_addressing_exception(vcpu); 4811 + /* Signal pending: try again */ 4812 + if (pfn == KVM_PFN_ERR_SIGPENDING) 4813 + return -EAGAIN; 4814 + 4815 + /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ 4816 + if (pfn == KVM_PFN_ERR_NEEDS_IO) { 4817 + trace_kvm_s390_major_guest_pfault(vcpu); 4818 + if (kvm_arch_setup_async_pf(vcpu)) 4819 + return 0; 4820 + vcpu->stat.pfault_sync++; 4821 + /* Could not setup async pfault, try again synchronously */ 4822 + flags &= ~FOLL_NOWAIT; 4823 + goto try_again; 4824 + } 4825 + /* Any other error */ 4826 + if (is_error_pfn(pfn)) 4827 + return -EFAULT; 4828 + 4829 + /* Success */ 4830 + mmap_read_lock(vcpu->arch.gmap->mm); 4831 + /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ 4832 + rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); 4833 + if (!rc) 4834 + rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); 4835 + scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { 4836 + kvm_release_faultin_page(vcpu->kvm, page, false, writable); 4837 + } 4838 + mmap_read_unlock(vcpu->arch.gmap->mm); 4839 + return rc; 4840 + } 4841 + 4842 + static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) 4843 + { 4844 + unsigned long gaddr_tmp; 4845 + gfn_t gfn; 4846 + 4847 + gfn = gpa_to_gfn(gaddr); 4848 + if (kvm_is_ucontrol(vcpu->kvm)) { 4849 + /* 4850 + * This translates the per-vCPU guest address into a 4851 + * fake guest address, which can then be used with the 4852 + * fake memslots that are identity mapping userspace. 4853 + * This allows ucontrol VMs to use the normal fault 4854 + * resolution path, like normal VMs. 4855 + */ 4856 + mmap_read_lock(vcpu->arch.gmap->mm); 4857 + gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); 4858 + mmap_read_unlock(vcpu->arch.gmap->mm); 4859 + if (gaddr_tmp == -EFAULT) { 4860 + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; 4861 + vcpu->run->s390_ucontrol.trans_exc_code = gaddr; 4862 + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; 4863 + return -EREMOTE; 4864 + } 4865 + gfn = gpa_to_gfn(gaddr_tmp); 4866 + } 4867 + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); 4868 + } 4869 + 4850 4870 static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) 4851 4871 { 4852 4872 unsigned int flags = 0; 4853 4873 unsigned long gaddr; 4854 - int rc = 0; 4855 4874 4856 4875 gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; 4857 4876 if (kvm_s390_cur_gmap_fault_is_write()) ··· 4962 4781 vcpu->stat.exit_null++; 4963 4782 break; 4964 4783 case PGM_NON_SECURE_STORAGE_ACCESS: 4965 - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, 4966 - "Unexpected program interrupt 0x%x, TEID 0x%016lx", 4967 - current->thread.gmap_int_code, current->thread.gmap_teid.val); 4784 + kvm_s390_assert_primary_as(vcpu); 4968 4785 /* 4969 4786 * This is normal operation; a page belonging to a protected 4970 4787 * guest has not been imported yet. 
Try to import the page into ··· 4973 4794 break; 4974 4795 case PGM_SECURE_STORAGE_ACCESS: 4975 4796 case PGM_SECURE_STORAGE_VIOLATION: 4976 - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, 4977 - "Unexpected program interrupt 0x%x, TEID 0x%016lx", 4978 - current->thread.gmap_int_code, current->thread.gmap_teid.val); 4797 + kvm_s390_assert_primary_as(vcpu); 4979 4798 /* 4980 4799 * This can happen after a reboot with asynchronous teardown; 4981 4800 * the new guest (normal or protected) will run on top of the ··· 5002 4825 case PGM_REGION_FIRST_TRANS: 5003 4826 case PGM_REGION_SECOND_TRANS: 5004 4827 case PGM_REGION_THIRD_TRANS: 5005 - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, 5006 - "Unexpected program interrupt 0x%x, TEID 0x%016lx", 5007 - current->thread.gmap_int_code, current->thread.gmap_teid.val); 5008 - if (vcpu->arch.gmap->pfault_enabled) { 5009 - rc = gmap_fault(vcpu->arch.gmap, gaddr, flags | FAULT_FLAG_RETRY_NOWAIT); 5010 - if (rc == -EFAULT) 5011 - return vcpu_post_run_addressing_exception(vcpu); 5012 - if (rc == -EAGAIN) { 5013 - trace_kvm_s390_major_guest_pfault(vcpu); 5014 - if (kvm_arch_setup_async_pf(vcpu)) 5015 - return 0; 5016 - vcpu->stat.pfault_sync++; 5017 - } else { 5018 - return rc; 5019 - } 5020 - } 5021 - rc = gmap_fault(vcpu->arch.gmap, gaddr, flags); 5022 - if (rc == -EFAULT) { 5023 - if (kvm_is_ucontrol(vcpu->kvm)) { 5024 - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; 5025 - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; 5026 - vcpu->run->s390_ucontrol.pgm_code = 0x10; 5027 - return -EREMOTE; 5028 - } 5029 - return vcpu_post_run_addressing_exception(vcpu); 5030 - } 5031 - break; 4828 + kvm_s390_assert_primary_as(vcpu); 4829 + return vcpu_dat_fault_handler(vcpu, gaddr, flags); 5032 4830 default: 5033 4831 KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", 5034 4832 current->thread.gmap_int_code, current->thread.gmap_teid.val); 5035 4833 send_sig(SIGSEGV, current, 0); 5036 4834 break; 5037 4835 } 5038 - return rc; 4836 + return 0; 5039 4837 } 5040 4838 5041 4839 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) ··· 5889 5737 } 5890 5738 #endif 5891 5739 case KVM_S390_VCPU_FAULT: { 5892 - r = gmap_fault(vcpu->arch.gmap, arg, 0); 5740 + idx = srcu_read_lock(&vcpu->kvm->srcu); 5741 + r = vcpu_dat_fault_handler(vcpu, arg, 0); 5742 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 5893 5743 break; 5894 5744 } 5895 5745 case KVM_ENABLE_CAP: ··· 6007 5853 { 6008 5854 gpa_t size; 6009 5855 6010 - if (kvm_is_ucontrol(kvm)) 5856 + if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS) 6011 5857 return -EINVAL; 6012 5858 6013 5859 /* When we are protected, we should not change the memory slots */ ··· 6058 5904 enum kvm_mr_change change) 6059 5905 { 6060 5906 int rc = 0; 5907 + 5908 + if (kvm_is_ucontrol(kvm)) 5909 + return; 6061 5910 6062 5911 switch (change) { 6063 5912 case KVM_MR_DELETE:
+19
arch/s390/kvm/kvm-s390.h
··· 20 20 #include <asm/processor.h> 21 21 #include <asm/sclp.h> 22 22 23 + #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) 24 + 23 25 static inline void kvm_s390_fpu_store(struct kvm_run *run) 24 26 { 25 27 fpu_stfpc(&run->s.regs.fpc); ··· 281 279 return gd; 282 280 } 283 281 282 + static inline hva_t gpa_to_hva(struct kvm *kvm, gpa_t gpa) 283 + { 284 + hva_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); 285 + 286 + if (!kvm_is_error_hva(hva)) 287 + hva |= offset_in_page(gpa); 288 + return hva; 289 + } 290 + 284 291 /* implemented in pv.c */ 285 292 int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); 286 293 int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); ··· 419 408 void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); 420 409 __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); 421 410 int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); 411 + int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); 412 + int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, 413 + unsigned long bits); 414 + 415 + static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) 416 + { 417 + return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); 418 + } 422 419 423 420 /* implemented in diag.c */ 424 421 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
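The new gpa_to_hva() helper above resolves the guest frame with gfn_to_hva() and then ORs the intra-page offset back in, so callers get a byte-accurate host address rather than a page-aligned one. A small sketch of the same offset arithmetic; the page size and the translation stub are assumptions, and the error-hva check from the real helper is omitted.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Hypothetical stub: map a guest frame number to a host page address. */
static uint64_t gfn_to_hva_stub(uint64_t gfn)
{
	return 0x7f0000000000UL + gfn * PAGE_SIZE;
}

static uint64_t gpa_to_hva(uint64_t gpa)
{
	uint64_t hva = gfn_to_hva_stub(gpa / PAGE_SIZE);

	/* keep the byte offset within the page, as the new helper does */
	return hva | (gpa & ~PAGE_MASK);
}

int main(void)
{
	/* guest address 0x2010 lands 0x10 bytes into its host page */
	printf("hva = 0x%llx\n", (unsigned long long)gpa_to_hva(0x2010));
	return 0;
}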
+21
arch/s390/kvm/pv.c
··· 17 17 #include <linux/sched/mm.h> 18 18 #include <linux/mmu_notifier.h> 19 19 #include "kvm-s390.h" 20 + #include "gmap.h" 20 21 21 22 bool kvm_s390_pv_is_protected(struct kvm *kvm) 22 23 { ··· 639 638 .tweak[1] = offset, 640 639 }; 641 640 int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); 641 + unsigned long vmaddr; 642 + bool unlocked; 642 643 643 644 *rc = uvcb.header.rc; 644 645 *rrc = uvcb.header.rrc; 646 + 647 + if (ret == -ENXIO) { 648 + mmap_read_lock(kvm->mm); 649 + vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); 650 + if (kvm_is_error_hva(vmaddr)) { 651 + ret = -EFAULT; 652 + } else { 653 + ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); 654 + if (!ret) 655 + ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); 656 + } 657 + mmap_read_unlock(kvm->mm); 658 + if (!ret) 659 + return -EAGAIN; 660 + return ret; 661 + } 645 662 646 663 if (ret && ret != -EAGAIN) 647 664 KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", ··· 678 659 679 660 KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", 680 661 addr, size); 662 + 663 + guard(srcu)(&kvm->srcu); 681 664 682 665 while (offset < size) { 683 666 ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
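unpack_one() above now repairs a missing gmap mapping itself: on -ENXIO it faults the page in with fixup_user_fault(), links it via __gmap_link() and reports -EAGAIN so the surrounding unpack loop simply retries the same offset. A hedged sketch of such a fix-up-then-retry caller; the stub and step size are illustrative, not the kernel code.

#include <errno.h>
#include <stdio.h>

/* Hypothetical stub: import one chunk; pretend the first call needs a retry. */
static int unpack_one_stub(unsigned long offset)
{
	static int pending_fixup = 1;

	(void)offset;
	if (pending_fixup) {
		pending_fixup = 0;
		return -EAGAIN;		/* mapping was just fixed up: try again */
	}
	return 0;
}

static int unpack_all(unsigned long size, unsigned long step)
{
	unsigned long offset = 0;
	int ret;

	while (offset < size) {
		ret = unpack_one_stub(offset);
		if (ret == -EAGAIN)
			continue;	/* retry the same offset after the fix-up */
		if (ret)
			return ret;
		offset += step;
	}
	return 0;
}

int main(void)
{
	printf("unpack: %d\n", unpack_all(3 * 4096UL, 4096UL));
	return 0;
}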
+68 -38
arch/s390/kvm/vsie.c
··· 13 13 #include <linux/bitmap.h> 14 14 #include <linux/sched/signal.h> 15 15 #include <linux/io.h> 16 + #include <linux/mman.h> 16 17 17 18 #include <asm/gmap.h> 18 19 #include <asm/mmu_context.h> ··· 23 22 #include <asm/facility.h> 24 23 #include "kvm-s390.h" 25 24 #include "gaccess.h" 25 + #include "gmap.h" 26 + 27 + enum vsie_page_flags { 28 + VSIE_PAGE_IN_USE = 0, 29 + }; 26 30 27 31 struct vsie_page { 28 32 struct kvm_s390_sie_block scb_s; /* 0x0000 */ ··· 52 46 gpa_t gvrd_gpa; /* 0x0240 */ 53 47 gpa_t riccbd_gpa; /* 0x0248 */ 54 48 gpa_t sdnx_gpa; /* 0x0250 */ 55 - __u8 reserved[0x0700 - 0x0258]; /* 0x0258 */ 49 + /* 50 + * guest address of the original SCB. Remains set for free vsie 51 + * pages, so we can properly look them up in our addr_to_page 52 + * radix tree. 53 + */ 54 + gpa_t scb_gpa; /* 0x0258 */ 55 + /* 56 + * Flags: must be set/cleared atomically after the vsie page can be 57 + * looked up by other CPUs. 58 + */ 59 + unsigned long flags; /* 0x0260 */ 60 + __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ 56 61 struct kvm_s390_crypto_cb crycb; /* 0x0700 */ 57 62 __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ 58 63 }; ··· 601 584 struct kvm *kvm = gmap->private; 602 585 struct vsie_page *cur; 603 586 unsigned long prefix; 604 - struct page *page; 605 587 int i; 606 588 607 589 if (!gmap_is_shadow(gmap)) ··· 610 594 * therefore we can safely reference them all the time. 611 595 */ 612 596 for (i = 0; i < kvm->arch.vsie.page_count; i++) { 613 - page = READ_ONCE(kvm->arch.vsie.pages[i]); 614 - if (!page) 597 + cur = READ_ONCE(kvm->arch.vsie.pages[i]); 598 + if (!cur) 615 599 continue; 616 - cur = page_to_virt(page); 617 600 if (READ_ONCE(cur->gmap) != gmap) 618 601 continue; 619 602 prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; ··· 1360 1345 return rc; 1361 1346 } 1362 1347 1348 + /* Try getting a given vsie page, returning "true" on success. */ 1349 + static inline bool try_get_vsie_page(struct vsie_page *vsie_page) 1350 + { 1351 + if (test_bit(VSIE_PAGE_IN_USE, &vsie_page->flags)) 1352 + return false; 1353 + return !test_and_set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); 1354 + } 1355 + 1356 + /* Put a vsie page acquired through get_vsie_page / try_get_vsie_page. */ 1357 + static void put_vsie_page(struct vsie_page *vsie_page) 1358 + { 1359 + clear_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); 1360 + } 1361 + 1363 1362 /* 1364 1363 * Get or create a vsie page for a scb address. 1365 1364 * ··· 1384 1355 static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) 1385 1356 { 1386 1357 struct vsie_page *vsie_page; 1387 - struct page *page; 1388 1358 int nr_vcpus; 1389 1359 1390 1360 rcu_read_lock(); 1391 - page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); 1361 + vsie_page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); 1392 1362 rcu_read_unlock(); 1393 - if (page) { 1394 - if (page_ref_inc_return(page) == 2) 1395 - return page_to_virt(page); 1396 - page_ref_dec(page); 1363 + if (vsie_page) { 1364 + if (try_get_vsie_page(vsie_page)) { 1365 + if (vsie_page->scb_gpa == addr) 1366 + return vsie_page; 1367 + /* 1368 + * We raced with someone reusing + putting this vsie 1369 + * page before we grabbed it. 
1370 + */ 1371 + put_vsie_page(vsie_page); 1372 + } 1397 1373 } 1398 1374 1399 1375 /* ··· 1409 1375 1410 1376 mutex_lock(&kvm->arch.vsie.mutex); 1411 1377 if (kvm->arch.vsie.page_count < nr_vcpus) { 1412 - page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); 1413 - if (!page) { 1378 + vsie_page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); 1379 + if (!vsie_page) { 1414 1380 mutex_unlock(&kvm->arch.vsie.mutex); 1415 1381 return ERR_PTR(-ENOMEM); 1416 1382 } 1417 - page_ref_inc(page); 1418 - kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; 1383 + __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); 1384 + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = vsie_page; 1419 1385 kvm->arch.vsie.page_count++; 1420 1386 } else { 1421 1387 /* reuse an existing entry that belongs to nobody */ 1422 1388 while (true) { 1423 - page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; 1424 - if (page_ref_inc_return(page) == 2) 1389 + vsie_page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; 1390 + if (try_get_vsie_page(vsie_page)) 1425 1391 break; 1426 - page_ref_dec(page); 1427 1392 kvm->arch.vsie.next++; 1428 1393 kvm->arch.vsie.next %= nr_vcpus; 1429 1394 } 1430 - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); 1395 + if (vsie_page->scb_gpa != ULONG_MAX) 1396 + radix_tree_delete(&kvm->arch.vsie.addr_to_page, 1397 + vsie_page->scb_gpa >> 9); 1431 1398 } 1432 - page->index = addr; 1433 - /* double use of the same address */ 1434 - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { 1435 - page_ref_dec(page); 1399 + /* Mark it as invalid until it resides in the tree. */ 1400 + vsie_page->scb_gpa = ULONG_MAX; 1401 + 1402 + /* Double use of the same address or allocation failure. */ 1403 + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, 1404 + vsie_page)) { 1405 + put_vsie_page(vsie_page); 1436 1406 mutex_unlock(&kvm->arch.vsie.mutex); 1437 1407 return NULL; 1438 1408 } 1409 + vsie_page->scb_gpa = addr; 1439 1410 mutex_unlock(&kvm->arch.vsie.mutex); 1440 1411 1441 - vsie_page = page_to_virt(page); 1442 1412 memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); 1443 1413 release_gmap_shadow(vsie_page); 1444 1414 vsie_page->fault_addr = 0; 1445 1415 vsie_page->scb_s.ihcpu = 0xffffU; 1446 1416 return vsie_page; 1447 - } 1448 - 1449 - /* put a vsie page acquired via get_vsie_page */ 1450 - static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) 1451 - { 1452 - struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); 1453 - 1454 - page_ref_dec(page); 1455 1417 } 1456 1418 1457 1419 int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) ··· 1500 1470 out_unpin_scb: 1501 1471 unpin_scb(vcpu, vsie_page, scb_addr); 1502 1472 out_put: 1503 - put_vsie_page(vcpu->kvm, vsie_page); 1473 + put_vsie_page(vsie_page); 1504 1474 1505 1475 return rc < 0 ? 
rc : 0; 1506 1476 } ··· 1516 1486 void kvm_s390_vsie_destroy(struct kvm *kvm) 1517 1487 { 1518 1488 struct vsie_page *vsie_page; 1519 - struct page *page; 1520 1489 int i; 1521 1490 1522 1491 mutex_lock(&kvm->arch.vsie.mutex); 1523 1492 for (i = 0; i < kvm->arch.vsie.page_count; i++) { 1524 - page = kvm->arch.vsie.pages[i]; 1493 + vsie_page = kvm->arch.vsie.pages[i]; 1525 1494 kvm->arch.vsie.pages[i] = NULL; 1526 - vsie_page = page_to_virt(page); 1527 1495 release_gmap_shadow(vsie_page); 1528 1496 /* free the radix tree entry */ 1529 - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); 1530 - __free_page(page); 1497 + if (vsie_page->scb_gpa != ULONG_MAX) 1498 + radix_tree_delete(&kvm->arch.vsie.addr_to_page, 1499 + vsie_page->scb_gpa >> 9); 1500 + free_page((unsigned long)vsie_page); 1531 1501 } 1532 1502 kvm->arch.vsie.page_count = 0; 1533 1503 mutex_unlock(&kvm->arch.vsie.mutex);
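The vsie rework above stops piggybacking on struct page reference counts: a vsie page is claimed with an atomic VSIE_PAGE_IN_USE bit (a cheap test_bit fast path, then test_and_set_bit), and scb_gpa replaces page->index for the addr_to_page radix-tree bookkeeping. A minimal userspace model of that claim/release pattern, using GCC/Clang atomic builtins as stand-ins for the kernel's bit operations.

#include <stdbool.h>
#include <stdio.h>

struct vsie_page_model {
	unsigned long flags;	/* bit 0: "in use" */
	unsigned long scb_gpa;	/* guest address of the original SCB */
};

#define IN_USE	(1UL << 0)

static bool try_get(struct vsie_page_model *p)
{
	/* cheap read first, then the atomic claim */
	if (__atomic_load_n(&p->flags, __ATOMIC_RELAXED) & IN_USE)
		return false;
	return !(__atomic_fetch_or(&p->flags, IN_USE, __ATOMIC_ACQUIRE) & IN_USE);
}

static void put(struct vsie_page_model *p)
{
	__atomic_fetch_and(&p->flags, ~IN_USE, __ATOMIC_RELEASE);
}

int main(void)
{
	struct vsie_page_model p = { .flags = 0, .scb_gpa = ~0UL };

	printf("first get:  %d\n", try_get(&p));	/* 1: claimed */
	printf("second get: %d\n", try_get(&p));	/* 0: already in use */
	put(&p);
	printf("after put:  %d\n", try_get(&p));	/* 1: claimable again */
	return 0;
}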
+150 -531
arch/s390/mm/gmap.c
··· 24 24 #include <asm/page.h> 25 25 #include <asm/tlb.h> 26 26 27 + /* 28 + * The address is saved in a radix tree directly; NULL would be ambiguous, 29 + * since 0 is a valid address, and NULL is returned when nothing was found. 30 + * The lower bits are ignored by all users of the macro, so it can be used 31 + * to distinguish a valid address 0 from a NULL. 32 + */ 33 + #define VALID_GADDR_FLAG 1 34 + #define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) 35 + #define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) 36 + 27 37 #define GMAP_SHADOW_FAKE_TABLE 1ULL 28 38 29 39 static struct page *gmap_alloc_crst(void) ··· 53 43 * 54 44 * Returns a guest address space structure. 55 45 */ 56 - static struct gmap *gmap_alloc(unsigned long limit) 46 + struct gmap *gmap_alloc(unsigned long limit) 57 47 { 58 48 struct gmap *gmap; 59 49 struct page *page; ··· 80 70 gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); 81 71 if (!gmap) 82 72 goto out; 83 - INIT_LIST_HEAD(&gmap->crst_list); 84 73 INIT_LIST_HEAD(&gmap->children); 85 - INIT_LIST_HEAD(&gmap->pt_list); 86 74 INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); 87 75 INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); 88 76 INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); ··· 90 82 page = gmap_alloc_crst(); 91 83 if (!page) 92 84 goto out_free; 93 - page->index = 0; 94 - list_add(&page->lru, &gmap->crst_list); 95 85 table = page_to_virt(page); 96 86 crst_table_init(table, etype); 97 87 gmap->table = table; ··· 103 97 out: 104 98 return NULL; 105 99 } 100 + EXPORT_SYMBOL_GPL(gmap_alloc); 106 101 107 102 /** 108 103 * gmap_create - create a guest address space ··· 192 185 } while (nr > 0); 193 186 } 194 187 188 + static void gmap_free_crst(unsigned long *table, bool free_ptes) 189 + { 190 + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; 191 + int i; 192 + 193 + if (is_segment) { 194 + if (!free_ptes) 195 + goto out; 196 + for (i = 0; i < _CRST_ENTRIES; i++) 197 + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) 198 + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); 199 + } else { 200 + for (i = 0; i < _CRST_ENTRIES; i++) 201 + if (!(table[i] & _REGION_ENTRY_INVALID)) 202 + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); 203 + } 204 + 205 + out: 206 + free_pages((unsigned long)table, CRST_ALLOC_ORDER); 207 + } 208 + 195 209 /** 196 210 * gmap_free - free a guest address space 197 211 * @gmap: pointer to the guest address space structure 198 212 * 199 213 * No locks required. There are no references to this gmap anymore. 200 214 */ 201 - static void gmap_free(struct gmap *gmap) 215 + void gmap_free(struct gmap *gmap) 202 216 { 203 - struct page *page, *next; 204 - 205 217 /* Flush tlb of all gmaps (if not already done for shadows) */ 206 218 if (!(gmap_is_shadow(gmap) && gmap->removed)) 207 219 gmap_flush_tlb(gmap); 208 220 /* Free all segment & region tables. */ 209 - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) 210 - __free_pages(page, CRST_ALLOC_ORDER); 221 + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); 222 + 211 223 gmap_radix_tree_free(&gmap->guest_to_host); 212 224 gmap_radix_tree_free(&gmap->host_to_guest); 213 225 214 226 /* Free additional data for a shadow gmap */ 215 227 if (gmap_is_shadow(gmap)) { 216 - struct ptdesc *ptdesc, *n; 217 - 218 - /* Free all page tables. 
*/ 219 - list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list) 220 - page_table_free_pgste(ptdesc); 221 228 gmap_rmap_radix_tree_free(&gmap->host_to_rmap); 222 229 /* Release reference to the parent */ 223 230 gmap_put(gmap->parent); ··· 239 218 240 219 kfree(gmap); 241 220 } 221 + EXPORT_SYMBOL_GPL(gmap_free); 242 222 243 223 /** 244 224 * gmap_get - increase reference counter for guest address space ··· 320 298 crst_table_init(new, init); 321 299 spin_lock(&gmap->guest_table_lock); 322 300 if (*table & _REGION_ENTRY_INVALID) { 323 - list_add(&page->lru, &gmap->crst_list); 324 301 *table = __pa(new) | _REGION_ENTRY_LENGTH | 325 302 (*table & _REGION_ENTRY_TYPE_MASK); 326 - page->index = gaddr; 327 303 page = NULL; 328 304 } 329 305 spin_unlock(&gmap->guest_table_lock); ··· 330 310 return 0; 331 311 } 332 312 333 - /** 334 - * __gmap_segment_gaddr - find virtual address from segment pointer 335 - * @entry: pointer to a segment table entry in the guest address space 336 - * 337 - * Returns the virtual address in the guest address space for the segment 338 - */ 339 - static unsigned long __gmap_segment_gaddr(unsigned long *entry) 313 + static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) 340 314 { 341 - struct page *page; 342 - unsigned long offset; 315 + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 316 + } 343 317 344 - offset = (unsigned long) entry / sizeof(unsigned long); 345 - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; 346 - page = pmd_pgtable_page((pmd_t *) entry); 347 - return page->index + offset; 318 + static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) 319 + { 320 + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 321 + } 322 + 323 + static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, 324 + unsigned long *gaddr) 325 + { 326 + *gaddr = host_to_guest_delete(gmap, vmaddr); 327 + if (IS_GADDR_VALID(*gaddr)) 328 + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); 329 + return NULL; 348 330 } 349 331 350 332 /** ··· 358 336 */ 359 337 static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) 360 338 { 361 - unsigned long *entry; 339 + unsigned long gaddr; 362 340 int flush = 0; 341 + pmd_t *pmdp; 363 342 364 343 BUG_ON(gmap_is_shadow(gmap)); 365 344 spin_lock(&gmap->guest_table_lock); 366 - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); 367 - if (entry) { 368 - flush = (*entry != _SEGMENT_ENTRY_EMPTY); 369 - *entry = _SEGMENT_ENTRY_EMPTY; 345 + 346 + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); 347 + if (pmdp) { 348 + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); 349 + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); 370 350 } 351 + 371 352 spin_unlock(&gmap->guest_table_lock); 372 353 return flush; 373 354 } ··· 489 464 EXPORT_SYMBOL_GPL(__gmap_translate); 490 465 491 466 /** 492 - * gmap_translate - translate a guest address to a user space address 493 - * @gmap: pointer to guest mapping meta data structure 494 - * @gaddr: guest address 495 - * 496 - * Returns user space address which corresponds to the guest address or 497 - * -EFAULT if no such mapping exists. 498 - * This function does not establish potentially missing page table entries. 
499 - */ 500 - unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) 501 - { 502 - unsigned long rc; 503 - 504 - mmap_read_lock(gmap->mm); 505 - rc = __gmap_translate(gmap, gaddr); 506 - mmap_read_unlock(gmap->mm); 507 - return rc; 508 - } 509 - EXPORT_SYMBOL_GPL(gmap_translate); 510 - 511 - /** 512 467 * gmap_unlink - disconnect a page table from the gmap shadow tables 513 468 * @mm: pointer to the parent mm_struct 514 469 * @table: pointer to the host page table ··· 587 582 spin_lock(&gmap->guest_table_lock); 588 583 if (*table == _SEGMENT_ENTRY_EMPTY) { 589 584 rc = radix_tree_insert(&gmap->host_to_guest, 590 - vmaddr >> PMD_SHIFT, table); 585 + vmaddr >> PMD_SHIFT, 586 + (void *)MAKE_VALID_GADDR(gaddr)); 591 587 if (!rc) { 592 588 if (pmd_leaf(*pmd)) { 593 589 *table = (pmd_val(*pmd) & ··· 611 605 radix_tree_preload_end(); 612 606 return rc; 613 607 } 614 - 615 - /** 616 - * fixup_user_fault_nowait - manually resolve a user page fault without waiting 617 - * @mm: mm_struct of target mm 618 - * @address: user address 619 - * @fault_flags:flags to pass down to handle_mm_fault() 620 - * @unlocked: did we unlock the mmap_lock while retrying 621 - * 622 - * This function behaves similarly to fixup_user_fault(), but it guarantees 623 - * that the fault will be resolved without waiting. The function might drop 624 - * and re-acquire the mm lock, in which case @unlocked will be set to true. 625 - * 626 - * The guarantee is that the fault is handled without waiting, but the 627 - * function itself might sleep, due to the lock. 628 - * 629 - * Context: Needs to be called with mm->mmap_lock held in read mode, and will 630 - * return with the lock held in read mode; @unlocked will indicate whether 631 - * the lock has been dropped and re-acquired. This is the same behaviour as 632 - * fixup_user_fault(). 633 - * 634 - * Return: 0 on success, -EAGAIN if the fault cannot be resolved without 635 - * waiting, -EFAULT if the fault cannot be resolved, -ENOMEM if out of 636 - * memory. 637 - */ 638 - static int fixup_user_fault_nowait(struct mm_struct *mm, unsigned long address, 639 - unsigned int fault_flags, bool *unlocked) 640 - { 641 - struct vm_area_struct *vma; 642 - unsigned int test_flags; 643 - vm_fault_t fault; 644 - int rc; 645 - 646 - fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; 647 - test_flags = fault_flags & FAULT_FLAG_WRITE ? 
VM_WRITE : VM_READ; 648 - 649 - vma = find_vma(mm, address); 650 - if (unlikely(!vma || address < vma->vm_start)) 651 - return -EFAULT; 652 - if (unlikely(!(vma->vm_flags & test_flags))) 653 - return -EFAULT; 654 - 655 - fault = handle_mm_fault(vma, address, fault_flags, NULL); 656 - /* the mm lock has been dropped, take it again */ 657 - if (fault & VM_FAULT_COMPLETED) { 658 - *unlocked = true; 659 - mmap_read_lock(mm); 660 - return 0; 661 - } 662 - /* the mm lock has not been dropped */ 663 - if (fault & VM_FAULT_ERROR) { 664 - rc = vm_fault_to_errno(fault, 0); 665 - BUG_ON(!rc); 666 - return rc; 667 - } 668 - /* the mm lock has not been dropped because of FAULT_FLAG_RETRY_NOWAIT */ 669 - if (fault & VM_FAULT_RETRY) 670 - return -EAGAIN; 671 - /* nothing needed to be done and the mm lock has not been dropped */ 672 - return 0; 673 - } 674 - 675 - /** 676 - * __gmap_fault - resolve a fault on a guest address 677 - * @gmap: pointer to guest mapping meta data structure 678 - * @gaddr: guest address 679 - * @fault_flags: flags to pass down to handle_mm_fault() 680 - * 681 - * Context: Needs to be called with mm->mmap_lock held in read mode. Might 682 - * drop and re-acquire the lock. Will always return with the lock held. 683 - */ 684 - static int __gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags) 685 - { 686 - unsigned long vmaddr; 687 - bool unlocked; 688 - int rc = 0; 689 - 690 - retry: 691 - unlocked = false; 692 - 693 - vmaddr = __gmap_translate(gmap, gaddr); 694 - if (IS_ERR_VALUE(vmaddr)) 695 - return vmaddr; 696 - 697 - if (fault_flags & FAULT_FLAG_RETRY_NOWAIT) 698 - rc = fixup_user_fault_nowait(gmap->mm, vmaddr, fault_flags, &unlocked); 699 - else 700 - rc = fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked); 701 - if (rc) 702 - return rc; 703 - /* 704 - * In the case that fixup_user_fault unlocked the mmap_lock during 705 - * fault-in, redo __gmap_translate() to avoid racing with a 706 - * map/unmap_segment. 707 - * In particular, __gmap_translate(), fixup_user_fault{,_nowait}(), 708 - * and __gmap_link() must all be called atomically in one go; if the 709 - * lock had been dropped in between, a retry is needed. 710 - */ 711 - if (unlocked) 712 - goto retry; 713 - 714 - return __gmap_link(gmap, gaddr, vmaddr); 715 - } 716 - 717 - /** 718 - * gmap_fault - resolve a fault on a guest address 719 - * @gmap: pointer to guest mapping meta data structure 720 - * @gaddr: guest address 721 - * @fault_flags: flags to pass down to handle_mm_fault() 722 - * 723 - * Returns 0 on success, -ENOMEM for out of memory conditions, -EFAULT if the 724 - * vm address is already mapped to a different guest segment, and -EAGAIN if 725 - * FAULT_FLAG_RETRY_NOWAIT was specified and the fault could not be processed 726 - * immediately. 727 - */ 728 - int gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags) 729 - { 730 - int rc; 731 - 732 - mmap_read_lock(gmap->mm); 733 - rc = __gmap_fault(gmap, gaddr, fault_flags); 734 - mmap_read_unlock(gmap->mm); 735 - return rc; 736 - } 737 - EXPORT_SYMBOL_GPL(gmap_fault); 608 + EXPORT_SYMBOL(__gmap_link); 738 609 739 610 /* 740 611 * this function is assumed to be called with mmap_lock held ··· 736 853 * 737 854 * Note: Can also be called for shadow gmaps. 
738 855 */ 739 - static inline unsigned long *gmap_table_walk(struct gmap *gmap, 740 - unsigned long gaddr, int level) 856 + unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) 741 857 { 742 858 const int asce_type = gmap->asce & _ASCE_TYPE_MASK; 743 859 unsigned long *table = gmap->table; ··· 787 905 } 788 906 return table; 789 907 } 908 + EXPORT_SYMBOL(gmap_table_walk); 790 909 791 910 /** 792 911 * gmap_pte_op_walk - walk the gmap page table, get the page table lock ··· 984 1101 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 985 1102 * @bits: pgste notification bits to set 986 1103 * 987 - * Returns 0 if successfully protected, -ENOMEM if out of memory and 988 - * -EFAULT if gaddr is invalid (or mapping for shadows is missing). 1104 + * Returns: 1105 + * PAGE_SIZE if a small page was successfully protected; 1106 + * HPAGE_SIZE if a large page was successfully protected; 1107 + * -ENOMEM if out of memory; 1108 + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); 1109 + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. 989 1110 * 990 - * Called with sg->mm->mmap_lock in read. 1111 + * Context: Called with sg->mm->mmap_lock in read. 991 1112 */ 992 - static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, 993 - unsigned long len, int prot, unsigned long bits) 1113 + int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) 994 1114 { 995 - unsigned long vmaddr, dist; 996 1115 pmd_t *pmdp; 997 - int rc; 1116 + int rc = 0; 998 1117 999 1118 BUG_ON(gmap_is_shadow(gmap)); 1000 - while (len) { 1001 - rc = -EAGAIN; 1002 - pmdp = gmap_pmd_op_walk(gmap, gaddr); 1003 - if (pmdp) { 1004 - if (!pmd_leaf(*pmdp)) { 1005 - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, 1006 - bits); 1007 - if (!rc) { 1008 - len -= PAGE_SIZE; 1009 - gaddr += PAGE_SIZE; 1010 - } 1011 - } else { 1012 - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, 1013 - bits); 1014 - if (!rc) { 1015 - dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); 1016 - len = len < dist ? 0 : len - dist; 1017 - gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; 1018 - } 1019 - } 1020 - gmap_pmd_op_end(gmap, pmdp); 1021 - } 1022 - if (rc) { 1023 - if (rc == -EINVAL) 1024 - return rc; 1025 1119 1026 - /* -EAGAIN, fixup of userspace mm and gmap */ 1027 - vmaddr = __gmap_translate(gmap, gaddr); 1028 - if (IS_ERR_VALUE(vmaddr)) 1029 - return vmaddr; 1030 - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); 1031 - if (rc) 1032 - return rc; 1033 - } 1120 + pmdp = gmap_pmd_op_walk(gmap, gaddr); 1121 + if (!pmdp) 1122 + return -EAGAIN; 1123 + 1124 + if (!pmd_leaf(*pmdp)) { 1125 + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); 1126 + if (!rc) 1127 + rc = PAGE_SIZE; 1128 + } else { 1129 + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); 1130 + if (!rc) 1131 + rc = HPAGE_SIZE; 1034 1132 } 1035 - return 0; 1036 - } 1133 + gmap_pmd_op_end(gmap, pmdp); 1037 1134 1038 - /** 1039 - * gmap_mprotect_notify - change access rights for a range of ptes and 1040 - * call the notifier if any pte changes again 1041 - * @gmap: pointer to guest mapping meta data structure 1042 - * @gaddr: virtual address in the guest address space 1043 - * @len: size of area 1044 - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 1045 - * 1046 - * Returns 0 if for each page in the given range a gmap mapping exists, 1047 - * the new access rights could be set and the notifier could be armed. 
1048 - * If the gmap mapping is missing for one or more pages -EFAULT is 1049 - * returned. If no memory could be allocated -ENOMEM is returned. 1050 - * This function establishes missing page table entries. 1051 - */ 1052 - int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, 1053 - unsigned long len, int prot) 1054 - { 1055 - int rc; 1056 - 1057 - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) 1058 - return -EINVAL; 1059 - if (!MACHINE_HAS_ESOP && prot == PROT_READ) 1060 - return -EINVAL; 1061 - mmap_read_lock(gmap->mm); 1062 - rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); 1063 - mmap_read_unlock(gmap->mm); 1064 1135 return rc; 1065 1136 } 1066 - EXPORT_SYMBOL_GPL(gmap_mprotect_notify); 1137 + EXPORT_SYMBOL_GPL(gmap_protect_one); 1067 1138 1068 1139 /** 1069 1140 * gmap_read_table - get an unsigned long value from a guest page table using ··· 1251 1414 __gmap_unshadow_pgt(sg, raddr, __va(pgt)); 1252 1415 /* Free page table */ 1253 1416 ptdesc = page_ptdesc(phys_to_page(pgt)); 1254 - list_del(&ptdesc->pt_list); 1255 1417 page_table_free_pgste(ptdesc); 1256 1418 } 1257 1419 ··· 1278 1442 __gmap_unshadow_pgt(sg, raddr, __va(pgt)); 1279 1443 /* Free page table */ 1280 1444 ptdesc = page_ptdesc(phys_to_page(pgt)); 1281 - list_del(&ptdesc->pt_list); 1282 1445 page_table_free_pgste(ptdesc); 1283 1446 } 1284 1447 } ··· 1307 1472 __gmap_unshadow_sgt(sg, raddr, __va(sgt)); 1308 1473 /* Free segment table */ 1309 1474 page = phys_to_page(sgt); 1310 - list_del(&page->lru); 1311 1475 __free_pages(page, CRST_ALLOC_ORDER); 1312 1476 } 1313 1477 ··· 1334 1500 __gmap_unshadow_sgt(sg, raddr, __va(sgt)); 1335 1501 /* Free segment table */ 1336 1502 page = phys_to_page(sgt); 1337 - list_del(&page->lru); 1338 1503 __free_pages(page, CRST_ALLOC_ORDER); 1339 1504 } 1340 1505 } ··· 1363 1530 __gmap_unshadow_r3t(sg, raddr, __va(r3t)); 1364 1531 /* Free region 3 table */ 1365 1532 page = phys_to_page(r3t); 1366 - list_del(&page->lru); 1367 1533 __free_pages(page, CRST_ALLOC_ORDER); 1368 1534 } 1369 1535 ··· 1390 1558 __gmap_unshadow_r3t(sg, raddr, __va(r3t)); 1391 1559 /* Free region 3 table */ 1392 1560 page = phys_to_page(r3t); 1393 - list_del(&page->lru); 1394 1561 __free_pages(page, CRST_ALLOC_ORDER); 1395 1562 } 1396 1563 } ··· 1419 1588 __gmap_unshadow_r2t(sg, raddr, __va(r2t)); 1420 1589 /* Free region 2 table */ 1421 1590 page = phys_to_page(r2t); 1422 - list_del(&page->lru); 1423 1591 __free_pages(page, CRST_ALLOC_ORDER); 1424 1592 } 1425 1593 ··· 1450 1620 r1t[i] = _REGION1_ENTRY_EMPTY; 1451 1621 /* Free region 2 table */ 1452 1622 page = phys_to_page(r2t); 1453 - list_del(&page->lru); 1454 1623 __free_pages(page, CRST_ALLOC_ORDER); 1455 1624 } 1456 1625 } ··· 1460 1631 * 1461 1632 * Called with sg->guest_table_lock 1462 1633 */ 1463 - static void gmap_unshadow(struct gmap *sg) 1634 + void gmap_unshadow(struct gmap *sg) 1464 1635 { 1465 1636 unsigned long *table; 1466 1637 ··· 1486 1657 break; 1487 1658 } 1488 1659 } 1489 - 1490 - /** 1491 - * gmap_find_shadow - find a specific asce in the list of shadow tables 1492 - * @parent: pointer to the parent gmap 1493 - * @asce: ASCE for which the shadow table is created 1494 - * @edat_level: edat level to be used for the shadow translation 1495 - * 1496 - * Returns the pointer to a gmap if a shadow table with the given asce is 1497 - * already available, ERR_PTR(-EAGAIN) if another one is just being created, 1498 - * otherwise NULL 1499 - */ 1500 - static struct gmap *gmap_find_shadow(struct gmap 
*parent, unsigned long asce, 1501 - int edat_level) 1502 - { 1503 - struct gmap *sg; 1504 - 1505 - list_for_each_entry(sg, &parent->children, list) { 1506 - if (sg->orig_asce != asce || sg->edat_level != edat_level || 1507 - sg->removed) 1508 - continue; 1509 - if (!sg->initialized) 1510 - return ERR_PTR(-EAGAIN); 1511 - refcount_inc(&sg->ref_count); 1512 - return sg; 1513 - } 1514 - return NULL; 1515 - } 1516 - 1517 - /** 1518 - * gmap_shadow_valid - check if a shadow guest address space matches the 1519 - * given properties and is still valid 1520 - * @sg: pointer to the shadow guest address space structure 1521 - * @asce: ASCE for which the shadow table is requested 1522 - * @edat_level: edat level to be used for the shadow translation 1523 - * 1524 - * Returns 1 if the gmap shadow is still valid and matches the given 1525 - * properties, the caller can continue using it. Returns 0 otherwise, the 1526 - * caller has to request a new shadow gmap in this case. 1527 - * 1528 - */ 1529 - int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) 1530 - { 1531 - if (sg->removed) 1532 - return 0; 1533 - return sg->orig_asce == asce && sg->edat_level == edat_level; 1534 - } 1535 - EXPORT_SYMBOL_GPL(gmap_shadow_valid); 1536 - 1537 - /** 1538 - * gmap_shadow - create/find a shadow guest address space 1539 - * @parent: pointer to the parent gmap 1540 - * @asce: ASCE for which the shadow table is created 1541 - * @edat_level: edat level to be used for the shadow translation 1542 - * 1543 - * The pages of the top level page table referred by the asce parameter 1544 - * will be set to read-only and marked in the PGSTEs of the kvm process. 1545 - * The shadow table will be removed automatically on any change to the 1546 - * PTE mapping for the source table. 1547 - * 1548 - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 1549 - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 1550 - * parent gmap table could not be protected. 
1551 - */ 1552 - struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, 1553 - int edat_level) 1554 - { 1555 - struct gmap *sg, *new; 1556 - unsigned long limit; 1557 - int rc; 1558 - 1559 - BUG_ON(parent->mm->context.allow_gmap_hpage_1m); 1560 - BUG_ON(gmap_is_shadow(parent)); 1561 - spin_lock(&parent->shadow_lock); 1562 - sg = gmap_find_shadow(parent, asce, edat_level); 1563 - spin_unlock(&parent->shadow_lock); 1564 - if (sg) 1565 - return sg; 1566 - /* Create a new shadow gmap */ 1567 - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); 1568 - if (asce & _ASCE_REAL_SPACE) 1569 - limit = -1UL; 1570 - new = gmap_alloc(limit); 1571 - if (!new) 1572 - return ERR_PTR(-ENOMEM); 1573 - new->mm = parent->mm; 1574 - new->parent = gmap_get(parent); 1575 - new->private = parent->private; 1576 - new->orig_asce = asce; 1577 - new->edat_level = edat_level; 1578 - new->initialized = false; 1579 - spin_lock(&parent->shadow_lock); 1580 - /* Recheck if another CPU created the same shadow */ 1581 - sg = gmap_find_shadow(parent, asce, edat_level); 1582 - if (sg) { 1583 - spin_unlock(&parent->shadow_lock); 1584 - gmap_free(new); 1585 - return sg; 1586 - } 1587 - if (asce & _ASCE_REAL_SPACE) { 1588 - /* only allow one real-space gmap shadow */ 1589 - list_for_each_entry(sg, &parent->children, list) { 1590 - if (sg->orig_asce & _ASCE_REAL_SPACE) { 1591 - spin_lock(&sg->guest_table_lock); 1592 - gmap_unshadow(sg); 1593 - spin_unlock(&sg->guest_table_lock); 1594 - list_del(&sg->list); 1595 - gmap_put(sg); 1596 - break; 1597 - } 1598 - } 1599 - } 1600 - refcount_set(&new->ref_count, 2); 1601 - list_add(&new->list, &parent->children); 1602 - if (asce & _ASCE_REAL_SPACE) { 1603 - /* nothing to protect, return right away */ 1604 - new->initialized = true; 1605 - spin_unlock(&parent->shadow_lock); 1606 - return new; 1607 - } 1608 - spin_unlock(&parent->shadow_lock); 1609 - /* protect after insertion, so it will get properly invalidated */ 1610 - mmap_read_lock(parent->mm); 1611 - rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, 1612 - ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, 1613 - PROT_READ, GMAP_NOTIFY_SHADOW); 1614 - mmap_read_unlock(parent->mm); 1615 - spin_lock(&parent->shadow_lock); 1616 - new->initialized = true; 1617 - if (rc) { 1618 - list_del(&new->list); 1619 - gmap_free(new); 1620 - new = ERR_PTR(rc); 1621 - } 1622 - spin_unlock(&parent->shadow_lock); 1623 - return new; 1624 - } 1625 - EXPORT_SYMBOL_GPL(gmap_shadow); 1660 + EXPORT_SYMBOL(gmap_unshadow); 1626 1661 1627 1662 /** 1628 1663 * gmap_shadow_r2t - create an empty shadow region 2 table ··· 1520 1827 page = gmap_alloc_crst(); 1521 1828 if (!page) 1522 1829 return -ENOMEM; 1523 - page->index = r2t & _REGION_ENTRY_ORIGIN; 1524 - if (fake) 1525 - page->index |= GMAP_SHADOW_FAKE_TABLE; 1526 1830 s_r2t = page_to_phys(page); 1527 1831 /* Install shadow region second table */ 1528 1832 spin_lock(&sg->guest_table_lock); ··· 1541 1851 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; 1542 1852 if (sg->edat_level >= 1) 1543 1853 *table |= (r2t & _REGION_ENTRY_PROTECT); 1544 - list_add(&page->lru, &sg->crst_list); 1545 1854 if (fake) { 1546 1855 /* nothing to protect for fake tables */ 1547 1856 *table &= ~_REGION_ENTRY_INVALID; ··· 1600 1911 page = gmap_alloc_crst(); 1601 1912 if (!page) 1602 1913 return -ENOMEM; 1603 - page->index = r3t & _REGION_ENTRY_ORIGIN; 1604 - if (fake) 1605 - page->index |= GMAP_SHADOW_FAKE_TABLE; 1606 1914 s_r3t = page_to_phys(page); 1607 1915 /* Install shadow region second table */ 1608 1916 
spin_lock(&sg->guest_table_lock); ··· 1621 1935 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; 1622 1936 if (sg->edat_level >= 1) 1623 1937 *table |= (r3t & _REGION_ENTRY_PROTECT); 1624 - list_add(&page->lru, &sg->crst_list); 1625 1938 if (fake) { 1626 1939 /* nothing to protect for fake tables */ 1627 1940 *table &= ~_REGION_ENTRY_INVALID; ··· 1680 1995 page = gmap_alloc_crst(); 1681 1996 if (!page) 1682 1997 return -ENOMEM; 1683 - page->index = sgt & _REGION_ENTRY_ORIGIN; 1684 - if (fake) 1685 - page->index |= GMAP_SHADOW_FAKE_TABLE; 1686 1998 s_sgt = page_to_phys(page); 1687 1999 /* Install shadow region second table */ 1688 2000 spin_lock(&sg->guest_table_lock); ··· 1701 2019 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; 1702 2020 if (sg->edat_level >= 1) 1703 2021 *table |= sgt & _REGION_ENTRY_PROTECT; 1704 - list_add(&page->lru, &sg->crst_list); 1705 2022 if (fake) { 1706 2023 /* nothing to protect for fake tables */ 1707 2024 *table &= ~_REGION_ENTRY_INVALID; ··· 1733 2052 } 1734 2053 EXPORT_SYMBOL_GPL(gmap_shadow_sgt); 1735 2054 1736 - /** 1737 - * gmap_shadow_pgt_lookup - find a shadow page table 1738 - * @sg: pointer to the shadow guest address space structure 1739 - * @saddr: the address in the shadow aguest address space 1740 - * @pgt: parent gmap address of the page table to get shadowed 1741 - * @dat_protection: if the pgtable is marked as protected by dat 1742 - * @fake: pgt references contiguous guest memory block, not a pgtable 1743 - * 1744 - * Returns 0 if the shadow page table was found and -EAGAIN if the page 1745 - * table was not found. 1746 - * 1747 - * Called with sg->mm->mmap_lock in read. 1748 - */ 1749 - int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, 1750 - unsigned long *pgt, int *dat_protection, 1751 - int *fake) 2055 + static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) 1752 2056 { 1753 - unsigned long *table; 1754 - struct page *page; 1755 - int rc; 2057 + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); 1756 2058 1757 - BUG_ON(!gmap_is_shadow(sg)); 1758 - spin_lock(&sg->guest_table_lock); 1759 - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ 1760 - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { 1761 - /* Shadow page tables are full pages (pte+pgste) */ 1762 - page = pfn_to_page(*table >> PAGE_SHIFT); 1763 - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; 1764 - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); 1765 - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); 1766 - rc = 0; 1767 - } else { 1768 - rc = -EAGAIN; 1769 - } 1770 - spin_unlock(&sg->guest_table_lock); 1771 - return rc; 2059 + pgstes += _PAGE_ENTRIES; 1772 2060 2061 + pgstes[0] &= ~PGSTE_ST2_MASK; 2062 + pgstes[1] &= ~PGSTE_ST2_MASK; 2063 + pgstes[2] &= ~PGSTE_ST2_MASK; 2064 + pgstes[3] &= ~PGSTE_ST2_MASK; 2065 + 2066 + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; 2067 + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; 2068 + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; 2069 + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; 1773 2070 } 1774 - EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); 1775 2071 1776 2072 /** 1777 2073 * gmap_shadow_pgt - instantiate a shadow page table ··· 1777 2119 ptdesc = page_table_alloc_pgste(sg->mm); 1778 2120 if (!ptdesc) 1779 2121 return -ENOMEM; 1780 - ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN; 2122 + origin = pgt & _SEGMENT_ENTRY_ORIGIN; 1781 2123 if (fake) 1782 - ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE; 2124 + origin |= GMAP_SHADOW_FAKE_TABLE; 2125 + 
gmap_pgste_set_pgt_addr(ptdesc, origin); 1783 2126 s_pgt = page_to_phys(ptdesc_page(ptdesc)); 1784 2127 /* Install shadow page table */ 1785 2128 spin_lock(&sg->guest_table_lock); ··· 1799 2140 /* mark as invalid as long as the parent table is not protected */ 1800 2141 *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | 1801 2142 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; 1802 - list_add(&ptdesc->pt_list, &sg->pt_list); 1803 2143 if (fake) { 1804 2144 /* nothing to protect for fake tables */ 1805 2145 *table &= ~_SEGMENT_ENTRY_INVALID; ··· 1976 2318 pte_t *pte, unsigned long bits) 1977 2319 { 1978 2320 unsigned long offset, gaddr = 0; 1979 - unsigned long *table; 1980 2321 struct gmap *gmap, *sg, *next; 1981 2322 1982 2323 offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); ··· 1983 2326 rcu_read_lock(); 1984 2327 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 1985 2328 spin_lock(&gmap->guest_table_lock); 1986 - table = radix_tree_lookup(&gmap->host_to_guest, 1987 - vmaddr >> PMD_SHIFT); 1988 - if (table) 1989 - gaddr = __gmap_segment_gaddr(table) + offset; 2329 + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; 1990 2330 spin_unlock(&gmap->guest_table_lock); 1991 - if (!table) 2331 + if (!IS_GADDR_VALID(gaddr)) 1992 2332 continue; 1993 2333 1994 2334 if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { ··· 2045 2391 rcu_read_lock(); 2046 2392 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2047 2393 spin_lock(&gmap->guest_table_lock); 2048 - pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, 2049 - vmaddr >> PMD_SHIFT); 2394 + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); 2050 2395 if (pmdp) { 2051 - gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); 2052 2396 pmdp_notify_gmap(gmap, pmdp, gaddr); 2053 2397 WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2054 2398 _SEGMENT_ENTRY_GMAP_UC | ··· 2090 2438 */ 2091 2439 void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) 2092 2440 { 2093 - unsigned long *entry, gaddr; 2441 + unsigned long gaddr; 2094 2442 struct gmap *gmap; 2095 2443 pmd_t *pmdp; 2096 2444 2097 2445 rcu_read_lock(); 2098 2446 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2099 2447 spin_lock(&gmap->guest_table_lock); 2100 - entry = radix_tree_delete(&gmap->host_to_guest, 2101 - vmaddr >> PMD_SHIFT); 2102 - if (entry) { 2103 - pmdp = (pmd_t *)entry; 2104 - gaddr = __gmap_segment_gaddr(entry); 2448 + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); 2449 + if (pmdp) { 2105 2450 pmdp_notify_gmap(gmap, pmdp, gaddr); 2106 - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2107 - _SEGMENT_ENTRY_GMAP_UC | 2108 - _SEGMENT_ENTRY)); 2451 + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2452 + _SEGMENT_ENTRY_GMAP_UC | 2453 + _SEGMENT_ENTRY)); 2109 2454 if (MACHINE_HAS_TLB_GUEST) 2110 2455 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2111 2456 gmap->asce, IDTE_LOCAL); 2112 2457 else if (MACHINE_HAS_IDTE) 2113 2458 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); 2114 - *entry = _SEGMENT_ENTRY_EMPTY; 2459 + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); 2115 2460 } 2116 2461 spin_unlock(&gmap->guest_table_lock); 2117 2462 } ··· 2123 2474 */ 2124 2475 void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) 2125 2476 { 2126 - unsigned long *entry, gaddr; 2477 + unsigned long gaddr; 2127 2478 struct gmap *gmap; 2128 2479 pmd_t *pmdp; 2129 2480 2130 2481 rcu_read_lock(); 2131 2482 list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { 2132 
2483 spin_lock(&gmap->guest_table_lock); 2133 - entry = radix_tree_delete(&gmap->host_to_guest, 2134 - vmaddr >> PMD_SHIFT); 2135 - if (entry) { 2136 - pmdp = (pmd_t *)entry; 2137 - gaddr = __gmap_segment_gaddr(entry); 2484 + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); 2485 + if (pmdp) { 2138 2486 pmdp_notify_gmap(gmap, pmdp, gaddr); 2139 - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2140 - _SEGMENT_ENTRY_GMAP_UC | 2141 - _SEGMENT_ENTRY)); 2487 + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | 2488 + _SEGMENT_ENTRY_GMAP_UC | 2489 + _SEGMENT_ENTRY)); 2142 2490 if (MACHINE_HAS_TLB_GUEST) 2143 2491 __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, 2144 2492 gmap->asce, IDTE_GLOBAL); ··· 2143 2497 __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); 2144 2498 else 2145 2499 __pmdp_csp(pmdp); 2146 - *entry = _SEGMENT_ENTRY_EMPTY; 2500 + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); 2147 2501 } 2148 2502 spin_unlock(&gmap->guest_table_lock); 2149 2503 } ··· 2589 2943 EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); 2590 2944 2591 2945 /** 2592 - * s390_unlist_old_asce - Remove the topmost level of page tables from the 2593 - * list of page tables of the gmap. 2594 - * @gmap: the gmap whose table is to be removed 2595 - * 2596 - * On s390x, KVM keeps a list of all pages containing the page tables of the 2597 - * gmap (the CRST list). This list is used at tear down time to free all 2598 - * pages that are now not needed anymore. 2599 - * 2600 - * This function removes the topmost page of the tree (the one pointed to by 2601 - * the ASCE) from the CRST list. 2602 - * 2603 - * This means that it will not be freed when the VM is torn down, and needs 2604 - * to be handled separately by the caller, unless a leak is actually 2605 - * intended. Notice that this function will only remove the page from the 2606 - * list, the page will still be used as a top level page table (and ASCE). 2607 - */ 2608 - void s390_unlist_old_asce(struct gmap *gmap) 2609 - { 2610 - struct page *old; 2611 - 2612 - old = virt_to_page(gmap->table); 2613 - spin_lock(&gmap->guest_table_lock); 2614 - list_del(&old->lru); 2615 - /* 2616 - * Sometimes the topmost page might need to be "removed" multiple 2617 - * times, for example if the VM is rebooted into secure mode several 2618 - * times concurrently, or if s390_replace_asce fails after calling 2619 - * s390_remove_old_asce and is attempted again later. In that case 2620 - * the old asce has been removed from the list, and therefore it 2621 - * will not be freed when the VM terminates, but the ASCE is still 2622 - * in use and still pointed to. 2623 - * A subsequent call to replace_asce will follow the pointer and try 2624 - * to remove the same page from the list again. 2625 - * Therefore it's necessary that the page of the ASCE has valid 2626 - * pointers, so list_del can work (and do nothing) without 2627 - * dereferencing stale or invalid pointers. 
2628 - */ 2629 - INIT_LIST_HEAD(&old->lru); 2630 - spin_unlock(&gmap->guest_table_lock); 2631 - } 2632 - EXPORT_SYMBOL_GPL(s390_unlist_old_asce); 2633 - 2634 - /** 2635 2946 * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy 2636 2947 * @gmap: the gmap whose ASCE needs to be replaced 2637 2948 * ··· 2607 3004 struct page *page; 2608 3005 void *table; 2609 3006 2610 - s390_unlist_old_asce(gmap); 2611 - 2612 3007 /* Replacing segment type ASCEs would cause serious issues */ 2613 3008 if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) 2614 3009 return -EINVAL; ··· 2614 3013 page = gmap_alloc_crst(); 2615 3014 if (!page) 2616 3015 return -ENOMEM; 2617 - page->index = 0; 2618 3016 table = page_to_virt(page); 2619 3017 memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); 2620 - 2621 - /* 2622 - * The caller has to deal with the old ASCE, but here we make sure 2623 - * the new one is properly added to the CRST list, so that 2624 - * it will be freed when the VM is torn down. 2625 - */ 2626 - spin_lock(&gmap->guest_table_lock); 2627 - list_add(&page->lru, &gmap->crst_list); 2628 - spin_unlock(&gmap->guest_table_lock); 2629 3018 2630 3019 /* Set new table origin while preserving existing ASCE control bits */ 2631 3020 asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); ··· 2626 3035 return 0; 2627 3036 } 2628 3037 EXPORT_SYMBOL_GPL(s390_replace_asce); 3038 + 3039 + /** 3040 + * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split 3041 + * @mm: the mm containing the folio to work on 3042 + * @folio: the folio 3043 + * @split: whether to split a large folio 3044 + * 3045 + * Context: Must be called while holding an extra reference to the folio; 3046 + * the mm lock should not be held. 3047 + */ 3048 + int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split) 3049 + { 3050 + int rc; 3051 + 3052 + lockdep_assert_not_held(&mm->mmap_lock); 3053 + folio_wait_writeback(folio); 3054 + lru_add_drain_all(); 3055 + if (split) { 3056 + folio_lock(folio); 3057 + rc = split_folio(folio); 3058 + folio_unlock(folio); 3059 + 3060 + if (rc != -EBUSY) 3061 + return rc; 3062 + } 3063 + return -EAGAIN; 3064 + } 3065 + EXPORT_SYMBOL_GPL(kvm_s390_wiggle_split_folio);
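In gmap.c above, the host_to_guest radix tree now stores the guest segment address itself instead of a pointer into the segment table; because guest address 0 is legal and a lookup miss returns NULL, bit 0 serves as a validity marker (VALID_GADDR_FLAG) that every user masks off. A short demo of that encoding, assuming the 1 MB segment (HPAGE_MASK) granularity used in the diff.

#include <stdio.h>

#define HPAGE_SHIFT		20	/* 1 MB segments, as on s390 */
#define HPAGE_MASK		(~((1UL << HPAGE_SHIFT) - 1))

#define VALID_GADDR_FLAG	1UL
#define IS_GADDR_VALID(g)	((g) & VALID_GADDR_FLAG)
#define MAKE_VALID_GADDR(g)	(((g) & HPAGE_MASK) | VALID_GADDR_FLAG)

int main(void)
{
	unsigned long stored_zero = MAKE_VALID_GADDR(0x0UL);
	unsigned long miss = 0;		/* what a radix-tree lookup miss returns */

	/* guest address 0 stays distinguishable from "nothing found" */
	printf("gaddr 0 valid? %d, miss valid? %d\n",
	       (int)IS_GADDR_VALID(stored_zero), (int)IS_GADDR_VALID(miss));

	/* users mask the low bits away before using the address */
	printf("recovered gaddr: 0x%lx\n", MAKE_VALID_GADDR(0x12345678UL) & HPAGE_MASK);
	return 0;
}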
-2
arch/s390/mm/pgalloc.c
··· 176 176 } 177 177 table = ptdesc_to_virt(ptdesc); 178 178 __arch_set_page_dat(table, 1); 179 - /* pt_list is used by gmap only */ 180 - INIT_LIST_HEAD(&ptdesc->pt_list); 181 179 memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); 182 180 memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); 183 181 return table;
+1 -1
arch/x86/kvm/cpuid.c
··· 1180 1180 SYNTHESIZED_F(SBPB), 1181 1181 SYNTHESIZED_F(IBPB_BRTYPE), 1182 1182 SYNTHESIZED_F(SRSO_NO), 1183 - SYNTHESIZED_F(SRSO_USER_KERNEL_NO), 1183 + F(SRSO_USER_KERNEL_NO), 1184 1184 ); 1185 1185 1186 1186 kvm_cpu_cap_init(CPUID_8000_0022_EAX,
+26 -7
arch/x86/kvm/mmu/mmu.c
··· 7120 7120 kmem_cache_destroy(mmu_page_header_cache); 7121 7121 } 7122 7122 7123 + static void kvm_wake_nx_recovery_thread(struct kvm *kvm) 7124 + { 7125 + /* 7126 + * The NX recovery thread is spawned on-demand at the first KVM_RUN and 7127 + * may not be valid even though the VM is globally visible. Do nothing, 7128 + * as such a VM can't have any possible NX huge pages. 7129 + */ 7130 + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); 7131 + 7132 + if (nx_thread) 7133 + vhost_task_wake(nx_thread); 7134 + } 7135 + 7123 7136 static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) 7124 7137 { 7125 7138 if (nx_hugepage_mitigation_hard_disabled) ··· 7193 7180 kvm_mmu_zap_all_fast(kvm); 7194 7181 mutex_unlock(&kvm->slots_lock); 7195 7182 7196 - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); 7183 + kvm_wake_nx_recovery_thread(kvm); 7197 7184 } 7198 7185 mutex_unlock(&kvm_lock); 7199 7186 } ··· 7328 7315 mutex_lock(&kvm_lock); 7329 7316 7330 7317 list_for_each_entry(kvm, &vm_list, vm_list) 7331 - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); 7318 + kvm_wake_nx_recovery_thread(kvm); 7332 7319 7333 7320 mutex_unlock(&kvm_lock); 7334 7321 } ··· 7464 7451 { 7465 7452 struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); 7466 7453 struct kvm *kvm = container_of(ka, struct kvm, arch); 7454 + struct vhost_task *nx_thread; 7467 7455 7468 7456 kvm->arch.nx_huge_page_last = get_jiffies_64(); 7469 - kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( 7470 - kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, 7471 - kvm, "kvm-nx-lpage-recovery"); 7457 + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, 7458 + kvm_nx_huge_page_recovery_worker_kill, 7459 + kvm, "kvm-nx-lpage-recovery"); 7472 7460 7473 - if (kvm->arch.nx_huge_page_recovery_thread) 7474 - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); 7461 + if (!nx_thread) 7462 + return; 7463 + 7464 + vhost_task_start(nx_thread); 7465 + 7466 + /* Make the task visible only once it is fully started. */ 7467 + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); 7475 7468 } 7476 7469 7477 7470 int kvm_mmu_post_init_vm(struct kvm *kvm)
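The mmu.c change above makes waking the NX huge-page recovery thread safe while it may not exist yet: the vhost task is created and started first, then published with WRITE_ONCE(), and wakers load it with READ_ONCE() and do nothing on NULL. A small userspace model of that start-fully-then-publish pattern; pthreads and C11 atomics stand in for vhost_task, so all names are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct worker {
	pthread_t thread;
};

static _Atomic(struct worker *) recovery_worker;	/* NULL until fully started */

static void *worker_fn(void *arg)
{
	return arg;
}

/* Publisher: create and start the worker, only then make it visible. */
static void start_worker(struct worker *w)
{
	if (pthread_create(&w->thread, NULL, worker_fn, NULL))
		return;				/* creation failed: stay invisible */
	atomic_store_explicit(&recovery_worker, w, memory_order_release);
}

/* Waker: tolerate the not-yet-started case instead of dereferencing NULL. */
static void wake_worker(void)
{
	struct worker *w = atomic_load_explicit(&recovery_worker, memory_order_acquire);

	if (w)
		printf("waking recovery worker\n");
	else
		printf("no worker yet, nothing to do\n");
}

int main(void)
{
	struct worker w;

	wake_worker();			/* before first run: pointer still NULL */
	start_worker(&w);
	wake_worker();			/* now visible and safe to wake */
	if (atomic_load(&recovery_worker))
		pthread_join(w.thread, NULL);
	return 0;
}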
+1 -6
arch/x86/kvm/x86.c
··· 12741 12741 "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); 12742 12742 } 12743 12743 12744 + once_init(&kvm->arch.nx_once); 12744 12745 return 0; 12745 12746 12746 12747 out_uninit_mmu: ··· 12749 12748 kvm_page_track_cleanup(kvm); 12750 12749 out: 12751 12750 return ret; 12752 - } 12753 - 12754 - int kvm_arch_post_init_vm(struct kvm *kvm) 12755 - { 12756 - once_init(&kvm->arch.nx_once); 12757 - return 0; 12758 12751 } 12759 12752 12760 12753 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
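With kvm_arch_post_init_vm() gone, the x86 NX-recovery setup is armed at VM creation (once_init() above) and performed lazily, exactly once, when a vCPU first runs. A userspace analogue of that deferred one-shot initialization using pthread_once(); this only models the idea, the kernel uses its own once helpers.

#include <pthread.h>
#include <stdio.h>

static pthread_once_t nx_once = PTHREAD_ONCE_INIT;

static void nx_setup(void)
{
	/* runs exactly once, the first time any vCPU actually enters the guest */
	printf("spawning NX huge page recovery worker\n");
}

static void vcpu_run(int id)
{
	pthread_once(&nx_once, nx_setup);	/* later callers see it already done */
	printf("vcpu %d running\n", id);
}

int main(void)
{
	vcpu_run(0);
	vcpu_run(1);	/* no second setup */
	return 0;
}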
-1
include/linux/kvm_host.h
··· 1615 1615 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); 1616 1616 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); 1617 1617 bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); 1618 - int kvm_arch_post_init_vm(struct kvm *kvm); 1619 1618 void kvm_arch_pre_destroy_vm(struct kvm *kvm); 1620 1619 void kvm_arch_create_vm_debugfs(struct kvm *kvm); 1621 1620
+2 -2
tools/testing/selftests/kvm/s390/cmma_test.c
··· 444 444 ); 445 445 } 446 446 447 - static void test_get_inital_dirty(void) 447 + static void test_get_initial_dirty(void) 448 448 { 449 449 struct kvm_vm *vm = create_vm_two_memslots(); 450 450 struct kvm_vcpu *vcpu; ··· 651 651 } testlist[] = { 652 652 { "migration mode and dirty tracking", test_migration_mode }, 653 653 { "GET_CMMA_BITS: basic calls", test_get_cmma_basic }, 654 - { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty }, 654 + { "GET_CMMA_BITS: all pages are dirty initially", test_get_initial_dirty }, 655 655 { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes }, 656 656 }; 657 657
+12 -20
tools/testing/selftests/kvm/s390/ucontrol_test.c
··· 88 88 " ahi %r0,1\n" 89 89 " st %r1,0(%r5,%r6)\n" 90 90 91 - " iske %r1,%r6\n" 92 - " ahi %r0,1\n" 93 - " diag 0,0,0x44\n" 94 - 95 91 " sske %r1,%r6\n" 96 92 " xgr %r1,%r1\n" 97 93 " iske %r1,%r6\n" ··· 455 459 }; 456 460 457 461 ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION, &region)); 458 - ASSERT_EQ(EINVAL, errno); 462 + ASSERT_TRUE(errno == EEXIST || errno == EINVAL) 463 + TH_LOG("errno %s (%i) not expected for ioctl KVM_SET_USER_MEMORY_REGION", 464 + strerror(errno), errno); 459 465 460 466 ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION2, &region2)); 461 - ASSERT_EQ(EINVAL, errno); 467 + ASSERT_TRUE(errno == EEXIST || errno == EINVAL) 468 + TH_LOG("errno %s (%i) not expected for ioctl KVM_SET_USER_MEMORY_REGION2", 469 + strerror(errno), errno); 462 470 } 463 471 464 472 TEST_F(uc_kvm, uc_map_unmap) ··· 596 596 ASSERT_EQ(true, uc_handle_exit(self)); 597 597 ASSERT_EQ(1, sync_regs->gprs[0]); 598 598 599 - /* ISKE */ 599 + /* SSKE + ISKE */ 600 + sync_regs->gprs[1] = skeyvalue; 601 + run->kvm_dirty_regs |= KVM_SYNC_GPRS; 600 602 ASSERT_EQ(0, uc_run_once(self)); 601 603 602 604 /* ··· 610 608 TEST_ASSERT_EQ(0, sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)); 611 609 TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); 612 610 TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); 613 - TEST_REQUIRE(sie_block->ipa != 0xb229); 611 + TEST_REQUIRE(sie_block->ipa != 0xb22b); 614 612 615 - /* ISKE contd. */ 613 + /* SSKE + ISKE contd. */ 616 614 ASSERT_EQ(false, uc_handle_exit(self)); 617 615 ASSERT_EQ(2, sync_regs->gprs[0]); 618 - /* assert initial skey (ACC = 0, R & C = 1) */ 619 - ASSERT_EQ(0x06, sync_regs->gprs[1]); 620 - uc_assert_diag44(self); 621 - 622 - /* SSKE + ISKE */ 623 - sync_regs->gprs[1] = skeyvalue; 624 - run->kvm_dirty_regs |= KVM_SYNC_GPRS; 625 - ASSERT_EQ(0, uc_run_once(self)); 626 - ASSERT_EQ(false, uc_handle_exit(self)); 627 - ASSERT_EQ(3, sync_regs->gprs[0]); 628 616 ASSERT_EQ(skeyvalue, sync_regs->gprs[1]); 629 617 uc_assert_diag44(self); 630 618 ··· 623 631 run->kvm_dirty_regs |= KVM_SYNC_GPRS; 624 632 ASSERT_EQ(0, uc_run_once(self)); 625 633 ASSERT_EQ(false, uc_handle_exit(self)); 626 - ASSERT_EQ(4, sync_regs->gprs[0]); 634 + ASSERT_EQ(3, sync_regs->gprs[0]); 627 635 /* assert R reset but rest of skey unchanged */ 628 636 ASSERT_EQ(skeyvalue & 0xfa, sync_regs->gprs[1]); 629 637 ASSERT_EQ(0, sync_regs->gprs[1] & 0x04);
+9 -16
virt/kvm/kvm_main.c
··· 1071 1071 } 1072 1072 1073 1073 /* 1074 - * Called after the VM is otherwise initialized, but just before adding it to 1075 - * the vm_list. 1076 - */ 1077 - int __weak kvm_arch_post_init_vm(struct kvm *kvm) 1078 - { 1079 - return 0; 1080 - } 1081 - 1082 - /* 1083 1074 * Called just after removing the VM from the vm_list, but before doing any 1084 1075 * other destruction. 1085 1076 */ ··· 1190 1199 if (r) 1191 1200 goto out_err_no_debugfs; 1192 1201 1193 - r = kvm_arch_post_init_vm(kvm); 1194 - if (r) 1195 - goto out_err; 1196 - 1197 1202 mutex_lock(&kvm_lock); 1198 1203 list_add(&kvm->vm_list, &vm_list); 1199 1204 mutex_unlock(&kvm_lock); ··· 1199 1212 1200 1213 return kvm; 1201 1214 1202 - out_err: 1203 - kvm_destroy_vm_debugfs(kvm); 1204 1215 out_err_no_debugfs: 1205 1216 kvm_coalesced_mmio_free(kvm); 1206 1217 out_no_coalesced_mmio: ··· 1956 1971 return -EINVAL; 1957 1972 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1958 1973 return -EINVAL; 1959 - if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) 1974 + 1975 + /* 1976 + * The size of userspace-defined memory regions is restricted in order 1977 + * to play nice with dirty bitmap operations, which are indexed with an 1978 + * "unsigned int". KVM's internal memory regions don't support dirty 1979 + * logging, and so are exempt. 1980 + */ 1981 + if (id < KVM_USER_MEM_SLOTS && 1982 + (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) 1960 1983 return -EINVAL; 1961 1984 1962 1985 slots = __kvm_memslots(kvm, as_id);
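The kvm_main.c hunk above keeps the per-slot size cap only for user memslots, since the cap exists to keep dirty-bitmap indices within an unsigned int and internal slots never enable dirty logging. For scale, assuming a cap of (1 << 31) - 1 pages of 4 KiB each, the limit works out to just under 8 TiB per user slot; the constant below is an assumption for illustration, not quoted from the header.

#include <stdio.h>

int main(void)
{
	/* assumption: user slots are capped at (1 << 31) - 1 pages of 4 KiB */
	unsigned long long max_pages = (1ULL << 31) - 1;
	unsigned long long page_size = 4096;

	printf("max user memslot: %llu bytes (~%llu GiB)\n",
	       max_pages * page_size, (max_pages * page_size) >> 30);
	return 0;
}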