Merge tag 'kvmarm-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

+4 -4

Documentation/arm64/memory.rst

··· 33 33 0000000000000000 0000ffffffffffff 256TB user 34 34 ffff000000000000 ffff7fffffffffff 128TB kernel logical memory map 35 35 [ffff600000000000 ffff7fffffffffff] 32TB [kasan shadow region] 36 - ffff800000000000 ffff800007ffffff 128MB modules 37 - ffff800008000000 fffffbffefffffff 124TB vmalloc 36 + ffff800000000000 ffff80007fffffff 2GB modules 37 + ffff800080000000 fffffbffefffffff 124TB vmalloc 38 38 fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down) 39 39 fffffbfffe000000 fffffbfffe7fffff 8MB [guard region] 40 40 fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space ··· 50 50 0000000000000000 000fffffffffffff 4PB user 51 51 fff0000000000000 ffff7fffffffffff ~4PB kernel logical memory map 52 52 [fffd800000000000 ffff7fffffffffff] 512TB [kasan shadow region] 53 - ffff800000000000 ffff800007ffffff 128MB modules 54 - ffff800008000000 fffffbffefffffff 124TB vmalloc 53 + ffff800000000000 ffff80007fffffff 2GB modules 54 + ffff800080000000 fffffbffefffffff 124TB vmalloc 55 55 fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down) 56 56 fffffbfffe000000 fffffbfffe7fffff 8MB [guard region] 57 57 fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space

+3

Documentation/arm64/silicon-errata.rst

··· 52 52 | Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 | 53 53 +----------------+-----------------+-----------------+-----------------------------+ 54 54 +----------------+-----------------+-----------------+-----------------------------+ 55 + | Ampere | AmpereOne | AC03_CPU_38 | AMPERE_ERRATUM_AC03_CPU_38 | 56 + +----------------+-----------------+-----------------+-----------------------------+ 57 + +----------------+-----------------+-----------------+-----------------------------+ 55 58 | ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 | 56 59 +----------------+-----------------+-----------------+-----------------------------+ 57 60 | ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |

+27

Documentation/virt/kvm/api.rst

··· 8445 8445 When getting the Modified Change Topology Report value, the attr->addr 8446 8446 must point to a byte where the value will be stored or retrieved from. 8447 8447 8448 + 8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 8449 + --------------------------------------- 8450 + 8451 + :Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 8452 + :Architectures: arm64 8453 + :Type: vm 8454 + :Parameters: arg[0] is the new split chunk size. 8455 + :Returns: 0 on success, -EINVAL if any memslot was already created. 8456 + 8457 + This capability sets the chunk size used in Eager Page Splitting. 8458 + 8459 + Eager Page Splitting improves the performance of dirty-logging (used 8460 + in live migrations) when guest memory is backed by huge-pages. It 8461 + avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing 8462 + it eagerly when enabling dirty logging (with the 8463 + KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using 8464 + KVM_CLEAR_DIRTY_LOG. 8465 + 8466 + The chunk size specifies how many pages to break at a time, using a 8467 + single allocation for each chunk. Bigger the chunk size, more pages 8468 + need to be allocated ahead of time. 8469 + 8470 + The chunk size needs to be a valid block size. The list of acceptable 8471 + block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a 8472 + 64-bit bitmap (each bit describing a block size). The default value is 8473 + 0, to disable the eager page splitting. 8474 + 8448 8475 9. Known KVM API problems 8449 8476 ========================= 8450 8477

+22 -25

arch/arm64/Kconfig

··· 207 207 select HAVE_IOREMAP_PROT 208 208 select HAVE_IRQ_TIME_ACCOUNTING 209 209 select HAVE_KVM 210 + select HAVE_MOD_ARCH_SPECIFIC 210 211 select HAVE_NMI 211 212 select HAVE_PERF_EVENTS 212 213 select HAVE_PERF_REGS ··· 407 406 408 407 menu "ARM errata workarounds via the alternatives framework" 409 408 409 + config AMPERE_ERRATUM_AC03_CPU_38 410 + bool "AmpereOne: AC03_CPU_38: Certain bits in the Virtualization Translation Control Register and Translation Control Registers do not follow RES0 semantics" 411 + default y 412 + help 413 + This option adds an alternative code sequence to work around Ampere 414 + erratum AC03_CPU_38 on AmpereOne. 415 + 416 + The affected design reports FEAT_HAFDBS as not implemented in 417 + ID_AA64MMFR1_EL1.HAFDBS, but (V)TCR_ELx.{HA,HD} are not RES0 418 + as required by the architecture. The unadvertised HAFDBS 419 + implementation suffers from an additional erratum where hardware 420 + A/D updates can occur after a PTE has been marked invalid. 421 + 422 + The workaround forces KVM to explicitly set VTCR_EL2.HA to 0, 423 + which avoids enabling unadvertised hardware Access Flag management 424 + at stage-2. 425 + 426 + If unsure, say Y. 427 + 410 428 config ARM64_WORKAROUND_CLEAN_CACHE 411 429 bool 412 430 ··· 597 577 config ARM64_ERRATUM_843419 598 578 bool "Cortex-A53: 843419: A load or store might access an incorrect address" 599 579 default y 600 - select ARM64_MODULE_PLTS if MODULES 601 580 help 602 581 This option links the kernel with '--fix-cortex-a53-843419' and 603 582 enables PLT support to replace certain ADRP instructions, which can ··· 2126 2107 register state capable of holding two dimensional matrix tiles to 2127 2108 enable various matrix operations. 
2128 2109 2129 - config ARM64_MODULE_PLTS 2130 - bool "Use PLTs to allow module memory to spill over into vmalloc area" 2131 - depends on MODULES 2132 - select HAVE_MOD_ARCH_SPECIFIC 2133 - help 2134 - Allocate PLTs when loading modules so that jumps and calls whose 2135 - targets are too far away for their relative offsets to be encoded 2136 - in the instructions themselves can be bounced via veneers in the 2137 - module's PLT. This allows modules to be allocated in the generic 2138 - vmalloc area after the dedicated module memory area has been 2139 - exhausted. 2140 - 2141 - When running with address space randomization (KASLR), the module 2142 - region itself may be too far away for ordinary relative jumps and 2143 - calls, and so in that case, module PLTs are required and cannot be 2144 - disabled. 2145 - 2146 - Specific errata workaround(s) might also force module PLTs to be 2147 - enabled (ARM64_ERRATUM_843419). 2148 - 2149 2110 config ARM64_PSEUDO_NMI 2150 2111 bool "Support for NMI-like interrupts" 2151 2112 select ARM_GIC_V3 ··· 2166 2167 2167 2168 config RANDOMIZE_BASE 2168 2169 bool "Randomize the address of the kernel image" 2169 - select ARM64_MODULE_PLTS if MODULES 2170 2170 select RELOCATABLE 2171 2171 help 2172 2172 Randomizes the virtual address at which the kernel image is ··· 2196 2198 When this option is not set, the module region will be randomized over 2197 2199 a limited range that contains the [_stext, _etext] interval of the 2198 2200 core kernel, so branch relocations are almost always in range unless 2199 - ARM64_MODULE_PLTS is enabled and the region is exhausted. In this 2200 - particular case of region exhaustion, modules might be able to fall 2201 - back to a larger 2GB area. 2201 + the region is exhausted. In this particular case of region 2202 + exhaustion, modules might be able to fall back to a larger 2GB area. 
2202 2203 2203 2204 config CC_HAVE_STACKPROTECTOR_SYSREG 2204 2205 def_bool $(cc-option,-mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=0)

+6

arch/arm64/include/asm/cpufeature.h

··· 15 15 #define MAX_CPU_FEATURES 128 16 16 #define cpu_feature(x) KERNEL_HWCAP_ ## x 17 17 18 + #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 19 + #define ARM64_SW_FEATURE_OVERRIDE_HVHE 4 20 + 18 21 #ifndef __ASSEMBLY__ 19 22 20 23 #include <linux/bug.h> ··· 918 915 return 8; 919 916 } 920 917 918 + s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur); 921 919 struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); 922 920 923 921 extern struct arm64_ftr_override id_aa64mmfr1_override; ··· 928 924 extern struct arm64_ftr_override id_aa64smfr0_override; 929 925 extern struct arm64_ftr_override id_aa64isar1_override; 930 926 extern struct arm64_ftr_override id_aa64isar2_override; 927 + 928 + extern struct arm64_ftr_override arm64_sw_feature_override; 931 929 932 930 u32 get_kvm_ipa_limit(void); 933 931 void dump_cpu_features(void);

+24 -3

arch/arm64/include/asm/el2_setup.h

··· 34 34 */ 35 35 .macro __init_el2_timers 36 36 mov x0, #3 // Enable EL1 physical timers 37 + mrs x1, hcr_el2 38 + and x1, x1, #HCR_E2H 39 + cbz x1, .LnVHE_\@ 40 + lsl x0, x0, #10 41 + .LnVHE_\@: 37 42 msr cnthctl_el2, x0 38 43 msr cntvoff_el2, xzr // Clear virtual offset 39 44 .endm ··· 129 124 .endm 130 125 131 126 /* Coprocessor traps */ 132 - .macro __init_el2_nvhe_cptr 127 + .macro __init_el2_cptr 128 + mrs x1, hcr_el2 129 + and x1, x1, #HCR_E2H 130 + cbz x1, .LnVHE_\@ 131 + mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN) 132 + b .Lset_cptr_\@ 133 + .LnVHE_\@: 133 134 mov x0, #0x33ff 135 + .Lset_cptr_\@: 134 136 msr cptr_el2, x0 // Disable copro. traps to EL2 135 137 .endm 136 138 ··· 203 191 __init_el2_gicv3 204 192 __init_el2_hstr 205 193 __init_el2_nvhe_idregs 206 - __init_el2_nvhe_cptr 194 + __init_el2_cptr 207 195 __init_el2_fgt 208 - __init_el2_nvhe_prepare_eret 209 196 .endm 210 197 211 198 #ifndef __KVM_NVHE_HYPERVISOR__ ··· 250 239 251 240 .Linit_sve_\@: /* SVE register access */ 252 241 mrs x0, cptr_el2 // Disable SVE traps 242 + mrs x1, hcr_el2 243 + and x1, x1, #HCR_E2H 244 + cbz x1, .Lcptr_nvhe_\@ 245 + 246 + // VHE case 247 + orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) 248 + b .Lset_cptr_\@ 249 + 250 + .Lcptr_nvhe_\@: // nVHE case 253 251 bic x0, x0, #CPTR_EL2_TZ 252 + .Lset_cptr_\@: 254 253 msr cptr_el2, x0 255 254 isb 256 255 mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector

+3 -4

arch/arm64/include/asm/kvm_arm.h

··· 18 18 #define HCR_ATA_SHIFT 56 19 19 #define HCR_ATA (UL(1) << HCR_ATA_SHIFT) 20 20 #define HCR_AMVOFFEN (UL(1) << 51) 21 + #define HCR_TID4 (UL(1) << 49) 21 22 #define HCR_FIEN (UL(1) << 47) 22 23 #define HCR_FWB (UL(1) << 46) 23 24 #define HCR_API (UL(1) << 41) ··· 87 86 #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ 88 87 HCR_BSU_IS | HCR_FB | HCR_TACR | \ 89 88 HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ 90 - HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID2) 89 + HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3) 91 90 #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) 92 91 #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) 93 92 #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) ··· 286 285 #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) 287 286 #define CPTR_EL2_TZ (1 << 8) 288 287 #define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ 289 - #define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1 290 288 #define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \ 291 289 GENMASK(29, 21) | \ 292 290 GENMASK(19, 14) | \ ··· 347 347 ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ 348 348 ECN(BKPT32), ECN(VECTOR32), ECN(BRK64), ECN(ERET) 349 349 350 - #define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\ 351 - CPACR_EL1_ZEN_EL1EN) 350 + #define CPACR_EL1_TTA (1 << 28) 352 351 353 352 #define kvm_mode_names \ 354 353 { PSR_MODE_EL0t, "EL0t" }, \

+4

arch/arm64/include/asm/kvm_asm.h

··· 68 68 __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, 69 69 __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, 70 70 __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, 71 + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh, 71 72 __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, 72 73 __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, 73 74 __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, ··· 226 225 extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); 227 226 extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, 228 227 int level); 228 + extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, 229 + phys_addr_t ipa, 230 + int level); 229 231 extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); 230 232 231 233 extern void __kvm_timer_set_cntvoff(u64 cntvoff);

+39 -7

arch/arm64/include/asm/kvm_emulate.h

··· 62 62 #else 63 63 static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) 64 64 { 65 - struct kvm *kvm = vcpu->kvm; 66 - 67 - WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, 68 - &kvm->arch.flags)); 69 - 70 - return test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); 65 + return test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features); 71 66 } 72 67 #endif 73 68 74 69 static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) 75 70 { 76 71 vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; 77 - if (is_kernel_in_hyp_mode()) 72 + if (has_vhe() || has_hvhe()) 78 73 vcpu->arch.hcr_el2 |= HCR_E2H; 79 74 if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) { 80 75 /* route synchronous external abort exceptions to EL2 */ ··· 89 94 */ 90 95 vcpu->arch.hcr_el2 |= HCR_TVM; 91 96 } 97 + 98 + if (cpus_have_final_cap(ARM64_HAS_EVT) && 99 + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) 100 + vcpu->arch.hcr_el2 |= HCR_TID4; 101 + else 102 + vcpu->arch.hcr_el2 |= HCR_TID2; 92 103 93 104 if (vcpu_el1_is_32bit(vcpu)) 94 105 vcpu->arch.hcr_el2 &= ~HCR_RW; ··· 571 570 return test_bit(feature, vcpu->arch.features); 572 571 } 573 572 573 + static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) 574 + { 575 + u64 val; 576 + 577 + if (has_vhe()) { 578 + val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | 579 + CPACR_EL1_ZEN_EL1EN); 580 + } else if (has_hvhe()) { 581 + val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); 582 + } else { 583 + val = CPTR_NVHE_EL2_RES1; 584 + 585 + if (vcpu_has_sve(vcpu) && 586 + (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) 587 + val |= CPTR_EL2_TZ; 588 + if (cpus_have_final_cap(ARM64_SME)) 589 + val &= ~CPTR_EL2_TSM; 590 + } 591 + 592 + return val; 593 + } 594 + 595 + static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) 596 + { 597 + u64 val = kvm_get_reset_cptr_el2(vcpu); 598 + 599 + if (has_vhe() || has_hvhe()) 600 + write_sysreg(val, cpacr_el1); 601 + else 602 + write_sysreg(val, cptr_el2); 603 + } 574 604 
#endif /* __ARM64_KVM_EMULATE_H__ */

+41 -20

arch/arm64/include/asm/kvm_host.h

··· 39 39 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS 40 40 41 41 #define KVM_VCPU_MAX_FEATURES 7 42 + #define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1) 42 43 43 44 #define KVM_REQ_SLEEP \ 44 45 KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) ··· 160 159 /* The last vcpu id that ran on each physical CPU */ 161 160 int __percpu *last_vcpu_ran; 162 161 162 + #define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0 163 + /* 164 + * Memory cache used to split 165 + * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It 166 + * is used to allocate stage2 page tables while splitting huge 167 + * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 168 + * influences both the capacity of the split page cache, and 169 + * how often KVM reschedules. Be wary of raising CHUNK_SIZE 170 + * too high. 171 + * 172 + * Protected by kvm->slots_lock. 173 + */ 174 + struct kvm_mmu_memory_cache split_page_cache; 175 + uint64_t split_page_chunk_size; 176 + 163 177 struct kvm_arch *arch; 164 178 }; 165 179 ··· 230 214 #define KVM_ARCH_FLAG_MTE_ENABLED 1 231 215 /* At least one vCPU has ran in the VM */ 232 216 #define KVM_ARCH_FLAG_HAS_RAN_ONCE 2 233 - /* 234 - * The following two bits are used to indicate the guest's EL1 235 - * register width configuration. A value of KVM_ARCH_FLAG_EL1_32BIT 236 - * bit is valid only when KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED is set. 237 - * Otherwise, the guest's EL1 register width has not yet been 238 - * determined yet. 
239 - */ 240 - #define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3 241 - #define KVM_ARCH_FLAG_EL1_32BIT 4 217 + /* The vCPU feature set for the VM is configured */ 218 + #define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED 3 242 219 /* PSCI SYSTEM_SUSPEND enabled for the guest */ 243 - #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 220 + #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 4 244 221 /* VM counter offset */ 245 - #define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6 222 + #define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 5 246 223 /* Timer PPIs made immutable */ 247 - #define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 7 224 + #define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6 248 225 /* SMCCC filter initialized for the VM */ 249 - #define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 8 226 + #define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 7 227 + /* Initial ID reg values loaded */ 228 + #define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 8 250 229 unsigned long flags; 230 + 231 + /* VM-wide vCPU feature set */ 232 + DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES); 251 233 252 234 /* 253 235 * VM-wide PMU filter, implemented as a bitmap and big enough for ··· 256 242 257 243 cpumask_var_t supported_cpus; 258 244 259 - u8 pfr0_csv2; 260 - u8 pfr0_csv3; 261 - struct { 262 - u8 imp:4; 263 - u8 unimp:4; 264 - } dfr0_pmuver; 265 - 266 245 /* Hypercall features firmware registers' descriptor */ 267 246 struct kvm_smccc_features smccc_feat; 268 247 struct maple_tree smccc_filter; 248 + 249 + /* 250 + * Emulated CPU ID registers per VM 251 + * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it 252 + * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. 253 + * 254 + * These emulated idregs are VM-wide, but accessed from the context of a vCPU. 255 + * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. 
256 + */ 257 + #define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) 258 + #define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)]) 259 + #define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) 260 + u64 id_regs[KVM_ARM_ID_REG_NUM]; 269 261 270 262 /* 271 263 * For an untrusted host VM, 'pkvm.handle' is used to lookup ··· 425 405 struct kvm_host_psci_config { 426 406 /* PSCI version used by host. */ 427 407 u32 version; 408 + u32 smccc_version; 428 409 429 410 /* Function IDs used by host if version is v0.1. */ 430 411 struct psci_0_1_function_ids function_ids_0_1;

+28 -9

arch/arm64/include/asm/kvm_hyp.h

··· 16 16 DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); 17 17 DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); 18 18 19 + /* 20 + * Unified accessors for registers that have a different encoding 21 + * between VHE and non-VHE. They must be specified without their "ELx" 22 + * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h. 23 + */ 24 + 25 + #if defined(__KVM_VHE_HYPERVISOR__) 26 + 27 + #define read_sysreg_el0(r) read_sysreg_s(r##_EL02) 28 + #define write_sysreg_el0(v,r) write_sysreg_s(v, r##_EL02) 29 + #define read_sysreg_el1(r) read_sysreg_s(r##_EL12) 30 + #define write_sysreg_el1(v,r) write_sysreg_s(v, r##_EL12) 31 + #define read_sysreg_el2(r) read_sysreg_s(r##_EL1) 32 + #define write_sysreg_el2(v,r) write_sysreg_s(v, r##_EL1) 33 + 34 + #else // !__KVM_VHE_HYPERVISOR__ 35 + 36 + #if defined(__KVM_NVHE_HYPERVISOR__) 37 + #define VHE_ALT_KEY ARM64_KVM_HVHE 38 + #else 39 + #define VHE_ALT_KEY ARM64_HAS_VIRT_HOST_EXTN 40 + #endif 41 + 19 42 #define read_sysreg_elx(r,nvh,vh) \ 20 43 ({ \ 21 44 u64 reg; \ 22 - asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \ 45 + asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \ 23 46 __mrs_s("%0", r##vh), \ 24 - ARM64_HAS_VIRT_HOST_EXTN) \ 47 + VHE_ALT_KEY) \ 25 48 : "=r" (reg)); \ 26 49 reg; \ 27 50 }) ··· 54 31 u64 __val = (u64)(v); \ 55 32 asm volatile(ALTERNATIVE(__msr_s(r##nvh, "%x0"), \ 56 33 __msr_s(r##vh, "%x0"), \ 57 - ARM64_HAS_VIRT_HOST_EXTN) \ 34 + VHE_ALT_KEY) \ 58 35 : : "rZ" (__val)); \ 59 36 } while (0) 60 - 61 - /* 62 - * Unified accessors for registers that have a different encoding 63 - * between VHE and non-VHE. They must be specified without their "ELx" 64 - * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h. 
65 - */ 66 37 67 38 #define read_sysreg_el0(r) read_sysreg_elx(r, _EL0, _EL02) 68 39 #define write_sysreg_el0(v,r) write_sysreg_elx(v, r, _EL0, _EL02) ··· 64 47 #define write_sysreg_el1(v,r) write_sysreg_elx(v, r, _EL1, _EL12) 65 48 #define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1) 66 49 #define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1) 50 + 51 + #endif // __KVM_VHE_HYPERVISOR__ 67 52 68 53 /* 69 54 * Without an __arch_swab32(), we fall back to ___constant_swab32(), but the

+3 -1

arch/arm64/include/asm/kvm_mmu.h

··· 172 172 173 173 void stage2_unmap_vm(struct kvm *kvm); 174 174 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); 175 + void kvm_uninit_stage2_mmu(struct kvm *kvm); 175 176 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); 176 177 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, 177 178 phys_addr_t pa, unsigned long size, bool writable); ··· 228 227 if (icache_is_aliasing()) { 229 228 /* any kind of VIPT cache */ 230 229 icache_inval_all_pou(); 231 - } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) { 230 + } else if (read_sysreg(CurrentEL) != CurrentEL_EL1 || 231 + !icache_is_vpipt()) { 232 232 /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ 233 233 icache_inval_pou((unsigned long)va, (unsigned long)va + size); 234 234 }

+75 -4

arch/arm64/include/asm/kvm_pgtable.h

··· 92 92 return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; 93 93 } 94 94 95 + static inline u32 kvm_supported_block_sizes(void) 96 + { 97 + u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; 98 + u32 r = 0; 99 + 100 + for (; level < KVM_PGTABLE_MAX_LEVELS; level++) 101 + r |= BIT(kvm_granule_shift(level)); 102 + 103 + return r; 104 + } 105 + 106 + static inline bool kvm_is_block_size_supported(u64 size) 107 + { 108 + bool is_power_of_two = IS_ALIGNED(size, size); 109 + 110 + return is_power_of_two && (size & kvm_supported_block_sizes()); 111 + } 112 + 95 113 /** 96 114 * struct kvm_pgtable_mm_ops - Memory management callbacks. 97 115 * @zalloc_page: Allocate a single zeroed memory page. ··· 122 104 * allocation is physically contiguous. 123 105 * @free_pages_exact: Free an exact number of memory pages previously 124 106 * allocated by zalloc_pages_exact. 125 - * @free_removed_table: Free a removed paging structure by unlinking and 107 + * @free_unlinked_table: Free an unlinked paging structure by unlinking and 126 108 * dropping references. 127 109 * @get_page: Increment the refcount on a page. 128 110 * @put_page: Decrement the refcount on a page. When the ··· 142 124 void* (*zalloc_page)(void *arg); 143 125 void* (*zalloc_pages_exact)(size_t size); 144 126 void (*free_pages_exact)(void *addr, size_t size); 145 - void (*free_removed_table)(void *addr, u32 level); 127 + void (*free_unlinked_table)(void *addr, u32 level); 146 128 void (*get_page)(void *addr); 147 129 void (*put_page)(void *addr); 148 130 int (*page_count)(void *addr); ··· 213 195 * with other software walkers. 214 196 * @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was 215 197 * invoked from a fault handler. 198 + * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries 199 + * without Break-before-make's 200 + * TLB invalidation. 201 + * @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries 202 + * without Cache maintenance 203 + * operations required. 
216 204 */ 217 205 enum kvm_pgtable_walk_flags { 218 206 KVM_PGTABLE_WALK_LEAF = BIT(0), ··· 226 202 KVM_PGTABLE_WALK_TABLE_POST = BIT(2), 227 203 KVM_PGTABLE_WALK_SHARED = BIT(3), 228 204 KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4), 205 + KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5), 206 + KVM_PGTABLE_WALK_SKIP_CMO = BIT(6), 229 207 }; 230 208 231 209 struct kvm_pgtable_visit_ctx { ··· 467 441 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); 468 442 469 443 /** 470 - * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure. 444 + * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. 471 445 * @mm_ops: Memory management callbacks. 472 446 * @pgtable: Unlinked stage-2 paging structure to be freed. 473 447 * @level: Level of the stage-2 paging structure to be freed. ··· 475 449 * The page-table is assumed to be unreachable by any hardware walkers prior to 476 450 * freeing and therefore no TLB invalidation is performed. 477 451 */ 478 - void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); 452 + void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); 453 + 454 + /** 455 + * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. 456 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 457 + * @phys: Physical address of the memory to map. 458 + * @level: Starting level of the stage-2 paging structure to be created. 459 + * @prot: Permissions and attributes for the mapping. 460 + * @mc: Cache of pre-allocated and zeroed memory from which to allocate 461 + * page-table pages. 462 + * @force_pte: Force mappings to PAGE_SIZE granularity. 463 + * 464 + * Returns an unlinked page-table tree. This new page-table tree is 465 + * not reachable (i.e., it is unlinked) from the root pgd and it's 466 + * therefore unreachableby the hardware page-table walker. 
No TLB 467 + * invalidation or CMOs are performed. 468 + * 469 + * If device attributes are not explicitly requested in @prot, then the 470 + * mapping will be normal, cacheable. 471 + * 472 + * Return: The fully populated (unlinked) stage-2 paging structure, or 473 + * an ERR_PTR(error) on failure. 474 + */ 475 + kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, 476 + u64 phys, u32 level, 477 + enum kvm_pgtable_prot prot, 478 + void *mc, bool force_pte); 479 479 480 480 /** 481 481 * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. ··· 671 619 * Return: 0 on success, negative error code on failure. 672 620 */ 673 621 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); 622 + 623 + /** 624 + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing 625 + * to PAGE_SIZE guest pages. 626 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 627 + * @addr: Intermediate physical address from which to split. 628 + * @size: Size of the range. 629 + * @mc: Cache of pre-allocated and zeroed memory from which to allocate 630 + * page-table pages. 631 + * 632 + * The function tries to split any level 1 or 2 entry that overlaps 633 + * with the input range (given by @addr and @size). 634 + * 635 + * Return: 0 on success, negative error code on failure. Note that 636 + * kvm_pgtable_stage2_split() is best effort: it tries to break as many 637 + * blocks in the input range as allowed by @mc_capacity. 638 + */ 639 + int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, 640 + struct kvm_mmu_memory_cache *mc); 674 641 675 642 /** 676 643 * kvm_pgtable_walk() - Walk a page-table.

+21

arch/arm64/include/asm/kvm_pkvm.h

··· 6 6 #ifndef __ARM64_KVM_PKVM_H__ 7 7 #define __ARM64_KVM_PKVM_H__ 8 8 9 + #include <linux/arm_ffa.h> 9 10 #include <linux/memblock.h> 11 + #include <linux/scatterlist.h> 10 12 #include <asm/kvm_pgtable.h> 11 13 12 14 /* Maximum number of VMs that can co-exist under pKVM. */ ··· 106 104 res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); 107 105 108 106 return res; 107 + } 108 + 109 + #define KVM_FFA_MBOX_NR_PAGES 1 110 + 111 + static inline unsigned long hyp_ffa_proxy_pages(void) 112 + { 113 + size_t desc_max; 114 + 115 + /* 116 + * The hypervisor FFA proxy needs enough memory to buffer a fragmented 117 + * descriptor returned from EL3 in response to a RETRIEVE_REQ call. 118 + */ 119 + desc_max = sizeof(struct ffa_mem_region) + 120 + sizeof(struct ffa_mem_region_attributes) + 121 + sizeof(struct ffa_composite_mem_region) + 122 + SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range); 123 + 124 + /* Plus a page each for the hypervisor's RX and TX mailboxes. */ 125 + return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); 109 126 } 110 127 111 128 #endif /* __ARM64_KVM_PKVM_H__ */

+9 -7

arch/arm64/include/asm/memory.h

··· 46 46 #define KIMAGE_VADDR (MODULES_END) 47 47 #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) 48 48 #define MODULES_VADDR (_PAGE_END(VA_BITS_MIN)) 49 - #define MODULES_VSIZE (SZ_128M) 49 + #define MODULES_VSIZE (SZ_2G) 50 50 #define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT))) 51 51 #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) 52 52 #define PCI_IO_END (VMEMMAP_START - SZ_8M) ··· 204 204 return kimage_vaddr - KIMAGE_VADDR; 205 205 } 206 206 207 + #ifdef CONFIG_RANDOMIZE_BASE 208 + void kaslr_init(void); 207 209 static inline bool kaslr_enabled(void) 208 210 { 209 - /* 210 - * The KASLR offset modulo MIN_KIMG_ALIGN is taken from the physical 211 - * placement of the image rather than from the seed, so a displacement 212 - * of less than MIN_KIMG_ALIGN means that no seed was provided. 213 - */ 214 - return kaslr_offset() >= MIN_KIMG_ALIGN; 211 + extern bool __kaslr_is_enabled; 212 + return __kaslr_is_enabled; 215 213 } 214 + #else 215 + static inline void kaslr_init(void) { } 216 + static inline bool kaslr_enabled(void) { return false; } 217 + #endif 216 218 217 219 /* 218 220 * Allow all memory at the discovery stage. We will clip it later.

-8

arch/arm64/include/asm/module.h

··· 7 7 8 8 #include <asm-generic/module.h> 9 9 10 - #ifdef CONFIG_ARM64_MODULE_PLTS 11 10 struct mod_plt_sec { 12 11 int plt_shndx; 13 12 int plt_num_entries; ··· 20 21 /* for CONFIG_DYNAMIC_FTRACE */ 21 22 struct plt_entry *ftrace_trampolines; 22 23 }; 23 - #endif 24 24 25 25 u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs, 26 26 void *loc, const Elf64_Rela *rela, ··· 27 29 28 30 u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs, 29 31 void *loc, u64 val); 30 - 31 - #ifdef CONFIG_RANDOMIZE_BASE 32 - extern u64 module_alloc_base; 33 - #else 34 - #define module_alloc_base ((u64)_etext - MODULES_VSIZE) 35 - #endif 36 32 37 33 struct plt_entry { 38 34 /*

-2

arch/arm64/include/asm/module.lds.h

··· 1 1 SECTIONS { 2 - #ifdef CONFIG_ARM64_MODULE_PLTS 3 2 .plt 0 : { BYTE(0) } 4 3 .init.plt 0 : { BYTE(0) } 5 4 .text.ftrace_trampoline 0 : { BYTE(0) } 6 - #endif 7 5 8 6 #ifdef CONFIG_KASAN_SW_TAGS 9 7 /*

+1

arch/arm64/include/asm/sysreg.h

··· 564 564 (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \ 565 565 (BIT(29))) 566 566 567 + #define SCTLR_EL2_BT (BIT(36)) 567 568 #ifdef CONFIG_CPU_BIG_ENDIAN 568 569 #define ENDIAN_SET_EL2 SCTLR_ELx_EE 569 570 #else

+11 -1

arch/arm64/include/asm/virt.h

··· 110 110 return __boot_cpu_mode[0] != __boot_cpu_mode[1]; 111 111 } 112 112 113 - static inline bool is_kernel_in_hyp_mode(void) 113 + static __always_inline bool is_kernel_in_hyp_mode(void) 114 114 { 115 + BUILD_BUG_ON(__is_defined(__KVM_NVHE_HYPERVISOR__) || 116 + __is_defined(__KVM_VHE_HYPERVISOR__)); 115 117 return read_sysreg(CurrentEL) == CurrentEL_EL2; 116 118 } 117 119 ··· 140 138 return false; 141 139 else 142 140 return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE); 141 + } 142 + 143 + static __always_inline bool has_hvhe(void) 144 + { 145 + if (is_vhe_hyp_code()) 146 + return false; 147 + 148 + return cpus_have_final_cap(ARM64_KVM_HVHE); 143 149 } 144 150 145 151 static inline bool is_hyp_nvhe(void)

+1 -2

arch/arm64/kernel/Makefile

··· 42 42 obj-$(CONFIG_COMPAT_ALIGNMENT_FIXUPS) += compat_alignment.o 43 43 obj-$(CONFIG_KUSER_HELPERS) += kuser32.o 44 44 obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o 45 - obj-$(CONFIG_MODULES) += module.o 46 - obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o 45 + obj-$(CONFIG_MODULES) += module.o module-plts.o 47 46 obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o 48 47 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 49 48 obj-$(CONFIG_CPU_PM) += sleep.o suspend.o

+7

arch/arm64/kernel/cpu_errata.c

··· 730 730 .cpu_enable = cpu_clear_bf16_from_user_emulation, 731 731 }, 732 732 #endif 733 + #ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 734 + { 735 + .desc = "AmpereOne erratum AC03_CPU_38", 736 + .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, 737 + ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), 738 + }, 739 + #endif 733 740 { 734 741 } 735 742 };

+33 -1

arch/arm64/kernel/cpufeature.c

··· 664 664 struct arm64_ftr_override __ro_after_init id_aa64isar1_override; 665 665 struct arm64_ftr_override __ro_after_init id_aa64isar2_override; 666 666 667 + struct arm64_ftr_override arm64_sw_feature_override; 668 + 667 669 static const struct __ftr_reg_entry { 668 670 u32 sys_id; 669 671 struct arm64_ftr_reg *reg; ··· 800 798 return reg; 801 799 } 802 800 803 - static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, 801 + s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, 804 802 s64 cur) 805 803 { 806 804 s64 ret = 0; ··· 1998 1996 return true; 1999 1997 } 2000 1998 1999 + static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, 2000 + int __unused) 2001 + { 2002 + u64 val; 2003 + 2004 + val = read_sysreg(id_aa64mmfr1_el1); 2005 + if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT)) 2006 + return false; 2007 + 2008 + val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask; 2009 + return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE); 2010 + } 2011 + 2001 2012 #ifdef CONFIG_ARM64_PAN 2002 2013 static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) 2003 2014 { ··· 2655 2640 .matches = has_cpuid_feature, 2656 2641 .cpu_enable = cpu_enable_dit, 2657 2642 ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP) 2643 + }, 2644 + { 2645 + .desc = "VHE for hypervisor only", 2646 + .capability = ARM64_KVM_HVHE, 2647 + .type = ARM64_CPUCAP_SYSTEM_FEATURE, 2648 + .matches = hvhe_possible, 2649 + }, 2650 + { 2651 + .desc = "Enhanced Virtualization Traps", 2652 + .capability = ARM64_HAS_EVT, 2653 + .type = ARM64_CPUCAP_SYSTEM_FEATURE, 2654 + .sys_reg = SYS_ID_AA64MMFR2_EL1, 2655 + .sign = FTR_UNSIGNED, 2656 + .field_pos = ID_AA64MMFR2_EL1_EVT_SHIFT, 2657 + .field_width = 4, 2658 + .min_field_value = ID_AA64MMFR2_EL1_EVT_IMP, 2659 + .matches = has_cpuid_feature, 2658 2660 }, 2659 2661 {}, 2660 2662 };

+3 -5

arch/arm64/kernel/ftrace.c

··· 197 197 198 198 static struct plt_entry *get_ftrace_plt(struct module *mod) 199 199 { 200 - #ifdef CONFIG_ARM64_MODULE_PLTS 200 + #ifdef CONFIG_MODULES 201 201 struct plt_entry *plt = mod->arch.ftrace_trampolines; 202 202 203 203 return &plt[FTRACE_PLT_IDX]; ··· 249 249 * must use a PLT to reach it. We can only place PLTs for modules, and 250 250 * only when module PLT support is built-in. 251 251 */ 252 - if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) 252 + if (!IS_ENABLED(CONFIG_MODULES)) 253 253 return false; 254 254 255 255 /* ··· 431 431 * 432 432 * Note: 'mod' is only set at module load time. 433 433 */ 434 - if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) && 435 - IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && mod) { 434 + if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) && mod) 436 435 return aarch64_insn_patch_text_nosync((void *)pc, new); 437 - } 438 436 439 437 if (!ftrace_find_callable_addr(rec, mod, &addr)) 440 438 return -EINVAL;

+2

arch/arm64/kernel/head.S

··· 603 603 msr sctlr_el1, x1 604 604 mov x2, xzr 605 605 2: 606 + __init_el2_nvhe_prepare_eret 607 + 606 608 mov w0, #BOOT_CPU_MODE_EL2 607 609 orr x0, x0, x2 608 610 eret

+9 -1

arch/arm64/kernel/hyp-stub.S

··· 82 82 tbnz x1, #0, 1f 83 83 84 84 // Needs to be VHE capable, obviously 85 - check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f x1 x2 85 + check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 0f 1f x1 x2 86 + 87 + 0: // Check whether we only want the hypervisor to run VHE, not the kernel 88 + adr_l x1, arm64_sw_feature_override 89 + ldr x2, [x1, FTR_OVR_VAL_OFFSET] 90 + ldr x1, [x1, FTR_OVR_MASK_OFFSET] 91 + and x2, x2, x1 92 + ubfx x2, x2, #ARM64_SW_FEATURE_OVERRIDE_HVHE, #4 93 + cbz x2, 2f 86 94 87 95 1: mov_q x0, HVC_STUB_ERR 88 96 eret

+16 -9

arch/arm64/kernel/idreg-override.c

··· 138 138 }, 139 139 }; 140 140 141 - extern struct arm64_ftr_override kaslr_feature_override; 141 + static bool __init hvhe_filter(u64 val) 142 + { 143 + u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1); 142 144 143 - static const struct ftr_set_desc kaslr __initconst = { 144 - .name = "kaslr", 145 - #ifdef CONFIG_RANDOMIZE_BASE 146 - .override = &kaslr_feature_override, 147 - #endif 145 + return (val == 1 && 146 + lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 && 147 + cpuid_feature_extract_unsigned_field(mmfr1, 148 + ID_AA64MMFR1_EL1_VH_SHIFT)); 149 + } 150 + 151 + static const struct ftr_set_desc sw_features __initconst = { 152 + .name = "arm64_sw", 153 + .override = &arm64_sw_feature_override, 148 154 .fields = { 149 - FIELD("disabled", 0, NULL), 155 + FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), 156 + FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), 150 157 {} 151 158 }, 152 159 }; ··· 165 158 &isar1, 166 159 &isar2, 167 160 &smfr0, 168 - &kaslr, 161 + &sw_features, 169 162 }; 170 163 171 164 static const struct { ··· 182 175 "id_aa64isar1.api=0 id_aa64isar1.apa=0 " 183 176 "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, 184 177 { "arm64.nomte", "id_aa64pfr1.mte=0" }, 185 - { "nokaslr", "kaslr.disabled=1" }, 178 + { "nokaslr", "arm64_sw.nokaslr=1" }, 186 179 }; 187 180 188 181 static int __init parse_nokaslr(char *unused)

+16 -71

arch/arm64/kernel/kaslr.c

··· 4 4 */ 5 5 6 6 #include <linux/cache.h> 7 - #include <linux/crc32.h> 8 7 #include <linux/init.h> 9 - #include <linux/libfdt.h> 10 - #include <linux/mm_types.h> 11 - #include <linux/sched.h> 12 - #include <linux/types.h> 13 - #include <linux/pgtable.h> 14 - #include <linux/random.h> 8 + #include <linux/printk.h> 15 9 16 - #include <asm/fixmap.h> 17 - #include <asm/kernel-pgtable.h> 10 + #include <asm/cpufeature.h> 18 11 #include <asm/memory.h> 19 - #include <asm/mmu.h> 20 - #include <asm/sections.h> 21 - #include <asm/setup.h> 22 12 23 - u64 __ro_after_init module_alloc_base; 24 13 u16 __initdata memstart_offset_seed; 25 14 26 - struct arm64_ftr_override kaslr_feature_override __initdata; 15 + bool __ro_after_init __kaslr_is_enabled = false; 27 16 28 - static int __init kaslr_init(void) 17 + void __init kaslr_init(void) 29 18 { 30 - u64 module_range; 31 - u32 seed; 32 - 33 - /* 34 - * Set a reasonable default for module_alloc_base in case 35 - * we end up running with module randomization disabled. 36 - */ 37 - module_alloc_base = (u64)_etext - MODULES_VSIZE; 38 - 39 - if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) { 19 + if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val & 20 + arm64_sw_feature_override.mask, 21 + ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) { 40 22 pr_info("KASLR disabled on command line\n"); 41 - return 0; 23 + return; 42 24 } 43 25 44 - if (!kaslr_enabled()) { 26 + /* 27 + * The KASLR offset modulo MIN_KIMG_ALIGN is taken from the physical 28 + * placement of the image rather than from the seed, so a displacement 29 + * of less than MIN_KIMG_ALIGN means that no seed was provided. 
30 + */ 31 + if (kaslr_offset() < MIN_KIMG_ALIGN) { 45 32 pr_warn("KASLR disabled due to lack of seed\n"); 46 - return 0; 33 + return; 47 34 } 48 35 49 36 pr_info("KASLR enabled\n"); 50 - 51 - /* 52 - * KASAN without KASAN_VMALLOC does not expect the module region to 53 - * intersect the vmalloc region, since shadow memory is allocated for 54 - * each module at load time, whereas the vmalloc region will already be 55 - * shadowed by KASAN zero pages. 56 - */ 57 - BUILD_BUG_ON((IS_ENABLED(CONFIG_KASAN_GENERIC) || 58 - IS_ENABLED(CONFIG_KASAN_SW_TAGS)) && 59 - !IS_ENABLED(CONFIG_KASAN_VMALLOC)); 60 - 61 - seed = get_random_u32(); 62 - 63 - if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { 64 - /* 65 - * Randomize the module region over a 2 GB window covering the 66 - * kernel. This reduces the risk of modules leaking information 67 - * about the address of the kernel itself, but results in 68 - * branches between modules and the core kernel that are 69 - * resolved via PLTs. (Branches between modules will be 70 - * resolved normally.) 71 - */ 72 - module_range = SZ_2G - (u64)(_end - _stext); 73 - module_alloc_base = max((u64)_end - SZ_2G, (u64)MODULES_VADDR); 74 - } else { 75 - /* 76 - * Randomize the module region by setting module_alloc_base to 77 - * a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE, 78 - * _stext) . This guarantees that the resulting region still 79 - * covers [_stext, _etext], and that all relative branches can 80 - * be resolved without veneers unless this region is exhausted 81 - * and we fall back to a larger 2GB window in module_alloc() 82 - * when ARM64_MODULE_PLTS is enabled. 
83 - */ 84 - module_range = MODULES_VSIZE - (u64)(_etext - _stext); 85 - } 86 - 87 - /* use the lower 21 bits to randomize the base of the module region */ 88 - module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21; 89 - module_alloc_base &= PAGE_MASK; 90 - 91 - return 0; 37 + __kaslr_is_enabled = true; 92 38 } 93 - subsys_initcall(kaslr_init)

+116 -37

arch/arm64/kernel/module.c

··· 7 7 * Author: Will Deacon <will.deacon@arm.com> 8 8 */ 9 9 10 + #define pr_fmt(fmt) "Modules: " fmt 11 + 10 12 #include <linux/bitops.h> 11 13 #include <linux/elf.h> 12 14 #include <linux/ftrace.h> ··· 17 15 #include <linux/kernel.h> 18 16 #include <linux/mm.h> 19 17 #include <linux/moduleloader.h> 18 + #include <linux/random.h> 20 19 #include <linux/scs.h> 21 20 #include <linux/vmalloc.h> 21 + 22 22 #include <asm/alternative.h> 23 23 #include <asm/insn.h> 24 24 #include <asm/scs.h> 25 25 #include <asm/sections.h> 26 26 27 + static u64 module_direct_base __ro_after_init = 0; 28 + static u64 module_plt_base __ro_after_init = 0; 29 + 30 + /* 31 + * Choose a random page-aligned base address for a window of 'size' bytes which 32 + * entirely contains the interval [start, end - 1]. 33 + */ 34 + static u64 __init random_bounding_box(u64 size, u64 start, u64 end) 35 + { 36 + u64 max_pgoff, pgoff; 37 + 38 + if ((end - start) >= size) 39 + return 0; 40 + 41 + max_pgoff = (size - (end - start)) / PAGE_SIZE; 42 + pgoff = get_random_u32_inclusive(0, max_pgoff); 43 + 44 + return start - pgoff * PAGE_SIZE; 45 + } 46 + 47 + /* 48 + * Modules may directly reference data and text anywhere within the kernel 49 + * image and other modules. References using PREL32 relocations have a +/-2G 50 + * range, and so we need to ensure that the entire kernel image and all modules 51 + * fall within a 2G window such that these are always within range. 52 + * 53 + * Modules may directly branch to functions and code within the kernel text, 54 + * and to functions and code within other modules. These branches will use 55 + * CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure 56 + * that the entire kernel text and all module text falls within a 128M window 57 + * such that these are always within range. With PLTs, we can expand this to a 58 + * 2G window. 
59 + * 60 + * We chose the 128M region to surround the entire kernel image (rather than 61 + * just the text) as using the same bounds for the 128M and 2G regions ensures 62 + * by construction that we never select a 128M region that is not a subset of 63 + * the 2G region. For very large and unusual kernel configurations this means 64 + * we may fall back to PLTs where they could have been avoided, but this keeps 65 + * the logic significantly simpler. 66 + */ 67 + static int __init module_init_limits(void) 68 + { 69 + u64 kernel_end = (u64)_end; 70 + u64 kernel_start = (u64)_text; 71 + u64 kernel_size = kernel_end - kernel_start; 72 + 73 + /* 74 + * The default modules region is placed immediately below the kernel 75 + * image, and is large enough to use the full 2G relocation range. 76 + */ 77 + BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END); 78 + BUILD_BUG_ON(MODULES_VSIZE < SZ_2G); 79 + 80 + if (!kaslr_enabled()) { 81 + if (kernel_size < SZ_128M) 82 + module_direct_base = kernel_end - SZ_128M; 83 + if (kernel_size < SZ_2G) 84 + module_plt_base = kernel_end - SZ_2G; 85 + } else { 86 + u64 min = kernel_start; 87 + u64 max = kernel_end; 88 + 89 + if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) { 90 + pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n"); 91 + } else { 92 + module_direct_base = random_bounding_box(SZ_128M, min, max); 93 + if (module_direct_base) { 94 + min = module_direct_base; 95 + max = module_direct_base + SZ_128M; 96 + } 97 + } 98 + 99 + module_plt_base = random_bounding_box(SZ_2G, min, max); 100 + } 101 + 102 + pr_info("%llu pages in range for non-PLT usage", 103 + module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0); 104 + pr_info("%llu pages in range for PLT usage", 105 + module_plt_base ? 
(SZ_2G - kernel_size) / PAGE_SIZE : 0); 106 + 107 + return 0; 108 + } 109 + subsys_initcall(module_init_limits); 110 + 27 111 void *module_alloc(unsigned long size) 28 112 { 29 - u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; 30 - gfp_t gfp_mask = GFP_KERNEL; 31 - void *p; 113 + void *p = NULL; 32 114 33 - /* Silence the initial allocation */ 34 - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) 35 - gfp_mask |= __GFP_NOWARN; 115 + /* 116 + * Where possible, prefer to allocate within direct branch range of the 117 + * kernel such that no PLTs are necessary. 118 + */ 119 + if (module_direct_base) { 120 + p = __vmalloc_node_range(size, MODULE_ALIGN, 121 + module_direct_base, 122 + module_direct_base + SZ_128M, 123 + GFP_KERNEL | __GFP_NOWARN, 124 + PAGE_KERNEL, 0, NUMA_NO_NODE, 125 + __builtin_return_address(0)); 126 + } 36 127 37 - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || 38 - IS_ENABLED(CONFIG_KASAN_SW_TAGS)) 39 - /* don't exceed the static module region - see below */ 40 - module_alloc_end = MODULES_END; 128 + if (!p && module_plt_base) { 129 + p = __vmalloc_node_range(size, MODULE_ALIGN, 130 + module_plt_base, 131 + module_plt_base + SZ_2G, 132 + GFP_KERNEL | __GFP_NOWARN, 133 + PAGE_KERNEL, 0, NUMA_NO_NODE, 134 + __builtin_return_address(0)); 135 + } 41 136 42 - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, 43 - module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, 44 - NUMA_NO_NODE, __builtin_return_address(0)); 137 + if (!p) { 138 + pr_warn_ratelimited("%s: unable to allocate memory\n", 139 + __func__); 140 + } 45 141 46 - if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && 47 - (IS_ENABLED(CONFIG_KASAN_VMALLOC) || 48 - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && 49 - !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) 50 - /* 51 - * KASAN without KASAN_VMALLOC can only deal with module 52 - * allocations being served from the reserved module region, 53 - * since the remainder of the vmalloc region is already 54 - * backed by zero shadow pages, and 
punching holes into it 55 - * is non-trivial. Since the module region is not randomized 56 - * when KASAN is enabled without KASAN_VMALLOC, it is even 57 - * less likely that the module region gets exhausted, so we 58 - * can simply omit this fallback in that case. 59 - */ 60 - p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, 61 - module_alloc_base + SZ_2G, GFP_KERNEL, 62 - PAGE_KERNEL, 0, NUMA_NO_NODE, 63 - __builtin_return_address(0)); 64 - 65 - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { 142 + if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { 66 143 vfree(p); 67 144 return NULL; 68 145 } ··· 529 448 case R_AARCH64_CALL26: 530 449 ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26, 531 450 AARCH64_INSN_IMM_26); 532 - 533 - if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && 534 - ovf == -ERANGE) { 451 + if (ovf == -ERANGE) { 535 452 val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym); 536 453 if (!val) 537 454 return -ENOEXEC; ··· 566 487 const Elf_Shdr *sechdrs, 567 488 struct module *mod) 568 489 { 569 - #if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE) 490 + #if defined(CONFIG_DYNAMIC_FTRACE) 570 491 const Elf_Shdr *s; 571 492 struct plt_entry *plts; 572 493

+2

arch/arm64/kernel/setup.c

··· 296 296 297 297 *cmdline_p = boot_command_line; 298 298 299 + kaslr_init(); 300 + 299 301 /* 300 302 * If know now we are going to need KPTI then use non-global 301 303 * mappings from the start, avoiding the cost of rewriting

+9 -5

arch/arm64/kvm/arch_timer.c

··· 1406 1406 kvm_get_running_vcpus()); 1407 1407 if (err) { 1408 1408 kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); 1409 - goto out_free_irq; 1409 + goto out_free_vtimer_irq; 1410 1410 } 1411 1411 1412 1412 static_branch_enable(&has_gic_active_state); ··· 1422 1422 if (err) { 1423 1423 kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", 1424 1424 host_ptimer_irq, err); 1425 - return err; 1425 + goto out_free_vtimer_irq; 1426 1426 } 1427 1427 1428 1428 if (has_gic) { ··· 1430 1430 kvm_get_running_vcpus()); 1431 1431 if (err) { 1432 1432 kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); 1433 - goto out_free_irq; 1433 + goto out_free_ptimer_irq; 1434 1434 } 1435 1435 } 1436 1436 ··· 1439 1439 kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", 1440 1440 info->physical_irq); 1441 1441 err = -ENODEV; 1442 - goto out_free_irq; 1442 + goto out_free_vtimer_irq; 1443 1443 } 1444 1444 1445 1445 return 0; 1446 - out_free_irq: 1446 + 1447 + out_free_ptimer_irq: 1448 + if (info->physical_irq > 0) 1449 + free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus()); 1450 + out_free_vtimer_irq: 1447 1451 free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); 1448 1452 return err; 1449 1453 }

+154 -55

arch/arm64/kvm/arm.c

··· 51 51 DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); 52 52 DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); 53 53 54 + DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); 55 + 54 56 static bool vgic_present; 55 57 56 58 static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); ··· 67 65 struct kvm_enable_cap *cap) 68 66 { 69 67 int r; 68 + u64 new_cap; 70 69 71 70 if (cap->flags) 72 71 return -EINVAL; ··· 92 89 r = 0; 93 90 set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); 94 91 break; 92 + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: 93 + new_cap = cap->args[0]; 94 + 95 + mutex_lock(&kvm->slots_lock); 96 + /* 97 + * To keep things simple, allow changing the chunk 98 + * size only when no memory slots have been created. 99 + */ 100 + if (!kvm_are_all_memslots_empty(kvm)) { 101 + r = -EINVAL; 102 + } else if (new_cap && !kvm_is_block_size_supported(new_cap)) { 103 + r = -EINVAL; 104 + } else { 105 + r = 0; 106 + kvm->arch.mmu.split_page_chunk_size = new_cap; 107 + } 108 + mutex_unlock(&kvm->slots_lock); 109 + break; 95 110 default: 96 111 r = -EINVAL; 97 112 break; ··· 121 100 static int kvm_arm_default_max_vcpus(void) 122 101 { 123 102 return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; 124 - } 125 - 126 - static void set_default_spectre(struct kvm *kvm) 127 - { 128 - /* 129 - * The default is to expose CSV2 == 1 if the HW isn't affected. 130 - * Although this is a per-CPU feature, we make it global because 131 - * asymmetric systems are just a nuisance. 132 - * 133 - * Userspace can override this as long as it doesn't promise 134 - * the impossible. 
135 - */ 136 - if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) 137 - kvm->arch.pfr0_csv2 = 1; 138 - if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) 139 - kvm->arch.pfr0_csv3 = 1; 140 103 } 141 104 142 105 /** ··· 166 161 /* The maximum number of VCPUs is limited by the host's GIC model */ 167 162 kvm->max_vcpus = kvm_arm_default_max_vcpus(); 168 163 169 - set_default_spectre(kvm); 170 164 kvm_arm_init_hypercalls(kvm); 171 165 172 - /* 173 - * Initialise the default PMUver before there is a chance to 174 - * create an actual PMU. 175 - */ 176 - kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); 166 + bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); 177 167 178 168 return 0; 179 169 ··· 301 301 case KVM_CAP_ARM_PTRAUTH_ADDRESS: 302 302 case KVM_CAP_ARM_PTRAUTH_GENERIC: 303 303 r = system_has_full_ptr_auth(); 304 + break; 305 + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: 306 + if (kvm) 307 + r = kvm->arch.mmu.split_page_chunk_size; 308 + else 309 + r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 310 + break; 311 + case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: 312 + r = kvm_supported_block_sizes(); 304 313 break; 305 314 default: 306 315 r = 0; ··· 1176 1167 return -EINVAL; 1177 1168 } 1178 1169 1179 - static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, 1180 - const struct kvm_vcpu_init *init) 1170 + static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, 1171 + const struct kvm_vcpu_init *init) 1181 1172 { 1182 - unsigned int i, ret; 1183 - u32 phys_target = kvm_target_cpu(); 1173 + unsigned long features = init->features[0]; 1174 + int i; 1184 1175 1185 - if (init->target != phys_target) 1186 - return -EINVAL; 1176 + if (features & ~KVM_VCPU_VALID_FEATURES) 1177 + return -ENOENT; 1187 1178 1188 - /* 1189 - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must 1190 - * use the same target. 
1191 - */ 1192 - if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) 1193 - return -EINVAL; 1194 - 1195 - /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ 1196 - for (i = 0; i < sizeof(init->features) * 8; i++) { 1197 - bool set = (init->features[i / 32] & (1 << (i % 32))); 1198 - 1199 - if (set && i >= KVM_VCPU_MAX_FEATURES) 1179 + for (i = 1; i < ARRAY_SIZE(init->features); i++) { 1180 + if (init->features[i]) 1200 1181 return -ENOENT; 1201 - 1202 - /* 1203 - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must 1204 - * use the same feature set. 1205 - */ 1206 - if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && 1207 - test_bit(i, vcpu->arch.features) != set) 1208 - return -EINVAL; 1209 - 1210 - if (set) 1211 - set_bit(i, vcpu->arch.features); 1212 1182 } 1213 1183 1214 - vcpu->arch.target = phys_target; 1184 + if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) 1185 + return 0; 1186 + 1187 + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) 1188 + return -EINVAL; 1189 + 1190 + /* MTE is incompatible with AArch32 */ 1191 + if (kvm_has_mte(vcpu->kvm)) 1192 + return -EINVAL; 1193 + 1194 + /* NV is incompatible with AArch32 */ 1195 + if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features)) 1196 + return -EINVAL; 1197 + 1198 + return 0; 1199 + } 1200 + 1201 + static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, 1202 + const struct kvm_vcpu_init *init) 1203 + { 1204 + unsigned long features = init->features[0]; 1205 + 1206 + return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES) || 1207 + vcpu->arch.target != init->target; 1208 + } 1209 + 1210 + static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, 1211 + const struct kvm_vcpu_init *init) 1212 + { 1213 + unsigned long features = init->features[0]; 1214 + struct kvm *kvm = vcpu->kvm; 1215 + int ret = -EINVAL; 1216 + 1217 + mutex_lock(&kvm->arch.config_lock); 1218 + 1219 + if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) && 1220 + 
!bitmap_equal(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES)) 1221 + goto out_unlock; 1222 + 1223 + vcpu->arch.target = init->target; 1224 + bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); 1215 1225 1216 1226 /* Now we know what it is, we can reset it. */ 1217 1227 ret = kvm_reset_vcpu(vcpu); 1218 1228 if (ret) { 1219 1229 vcpu->arch.target = -1; 1220 1230 bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); 1231 + goto out_unlock; 1221 1232 } 1222 1233 1234 + bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); 1235 + set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags); 1236 + 1237 + out_unlock: 1238 + mutex_unlock(&kvm->arch.config_lock); 1223 1239 return ret; 1240 + } 1241 + 1242 + static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, 1243 + const struct kvm_vcpu_init *init) 1244 + { 1245 + int ret; 1246 + 1247 + if (init->target != kvm_target_cpu()) 1248 + return -EINVAL; 1249 + 1250 + ret = kvm_vcpu_init_check_features(vcpu, init); 1251 + if (ret) 1252 + return ret; 1253 + 1254 + if (vcpu->arch.target == -1) 1255 + return __kvm_vcpu_set_target(vcpu, init); 1256 + 1257 + if (kvm_vcpu_init_changed(vcpu, init)) 1258 + return -EINVAL; 1259 + 1260 + return kvm_reset_vcpu(vcpu); 1224 1261 } 1225 1262 1226 1263 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, 1227 1264 struct kvm_vcpu_init *init) 1228 1265 { 1266 + bool power_off = false; 1229 1267 int ret; 1268 + 1269 + /* 1270 + * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid 1271 + * reflecting it in the finalized feature set, thus limiting its scope 1272 + * to a single KVM_ARM_VCPU_INIT call. 
1273 + */ 1274 + if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) { 1275 + init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF); 1276 + power_off = true; 1277 + } 1230 1278 1231 1279 ret = kvm_vcpu_set_target(vcpu, init); 1232 1280 if (ret) ··· 1306 1240 } 1307 1241 1308 1242 vcpu_reset_hcr(vcpu); 1309 - vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; 1243 + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); 1310 1244 1311 1245 /* 1312 1246 * Handle the "start in power-off" case. 1313 1247 */ 1314 1248 spin_lock(&vcpu->arch.mp_state_lock); 1315 1249 1316 - if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) 1250 + if (power_off) 1317 1251 __kvm_arm_vcpu_power_off(vcpu); 1318 1252 else 1319 1253 WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); ··· 1732 1666 1733 1667 params->mair_el2 = read_sysreg(mair_el1); 1734 1668 1735 - tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1; 1669 + tcr = read_sysreg(tcr_el1); 1670 + if (cpus_have_final_cap(ARM64_KVM_HVHE)) { 1671 + tcr |= TCR_EPD1_MASK; 1672 + } else { 1673 + tcr &= TCR_EL2_MASK; 1674 + tcr |= TCR_EL2_RES1; 1675 + } 1736 1676 tcr &= ~TCR_T0SZ_MASK; 1737 1677 tcr |= TCR_T0SZ(hyp_va_bits); 1738 1678 params->tcr_el2 = tcr; ··· 1748 1676 params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; 1749 1677 else 1750 1678 params->hcr_el2 = HCR_HOST_NVHE_FLAGS; 1679 + if (cpus_have_final_cap(ARM64_KVM_HVHE)) 1680 + params->hcr_el2 |= HCR_E2H; 1751 1681 params->vttbr = params->vtcr = 0; 1752 1682 1753 1683 /* ··· 1984 1910 } 1985 1911 1986 1912 kvm_host_psci_config.version = psci_ops.get_version(); 1913 + kvm_host_psci_config.smccc_version = arm_smccc_get_version(); 1987 1914 1988 1915 if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { 1989 1916 kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); ··· 2140 2065 free_hyp_pgds(); 2141 2066 2142 2067 return 0; 2068 + } 2069 + 2070 + static void pkvm_hyp_init_ptrauth(void) 2071 + { 2072 + struct kvm_cpu_context *hyp_ctxt; 2073 + int cpu; 
2074 + 2075 + for_each_possible_cpu(cpu) { 2076 + hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu); 2077 + hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long(); 2078 + hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long(); 2079 + hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long(); 2080 + hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long(); 2081 + hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long(); 2082 + hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long(); 2083 + hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long(); 2084 + hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long(); 2085 + hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long(); 2086 + hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long(); 2087 + } 2143 2088 } 2144 2089 2145 2090 /* Inits Hyp-mode on all online CPUs */ ··· 2323 2228 kvm_hyp_init_symbols(); 2324 2229 2325 2230 if (is_protected_kvm_enabled()) { 2231 + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && 2232 + cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH)) 2233 + pkvm_hyp_init_ptrauth(); 2234 + 2326 2235 init_cpu_logical_map(); 2327 2236 2328 2237 if (!init_psci_relay()) {

+2 -2

arch/arm64/kvm/fpsimd.c

··· 180 180 181 181 /* 182 182 * If we have VHE then the Hyp code will reset CPACR_EL1 to 183 - * CPACR_EL1_DEFAULT and we need to reenable SME. 183 + * the default value and we need to reenable SME. 184 184 */ 185 185 if (has_vhe() && system_supports_sme()) { 186 186 /* Also restore EL0 state seen on entry */ ··· 210 210 /* 211 211 * The FPSIMD/SVE state in the CPU has not been touched, and we 212 212 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been 213 - * reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE 213 + * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE 214 214 * for EL0. To avoid spurious traps, restore the trap state 215 215 * seen by kvm_arch_vcpu_load_fp(): 216 216 */

+82 -19

arch/arm64/kvm/hyp/include/hyp/switch.h

··· 70 70 } 71 71 } 72 72 73 + static inline bool __hfgxtr_traps_required(void) 74 + { 75 + if (cpus_have_final_cap(ARM64_SME)) 76 + return true; 77 + 78 + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) 79 + return true; 80 + 81 + return false; 82 + } 83 + 84 + static inline void __activate_traps_hfgxtr(void) 85 + { 86 + u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; 87 + 88 + if (cpus_have_final_cap(ARM64_SME)) { 89 + tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; 90 + 91 + r_clr |= tmp; 92 + w_clr |= tmp; 93 + } 94 + 95 + /* 96 + * Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD. 97 + */ 98 + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) 99 + w_set |= HFGxTR_EL2_TCR_EL1_MASK; 100 + 101 + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); 102 + sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); 103 + } 104 + 105 + static inline void __deactivate_traps_hfgxtr(void) 106 + { 107 + u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; 108 + 109 + if (cpus_have_final_cap(ARM64_SME)) { 110 + tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; 111 + 112 + r_set |= tmp; 113 + w_set |= tmp; 114 + } 115 + 116 + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) 117 + w_clr |= HFGxTR_EL2_TCR_EL1_MASK; 118 + 119 + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); 120 + sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); 121 + } 122 + 73 123 static inline void __activate_traps_common(struct kvm_vcpu *vcpu) 74 124 { 75 125 /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ ··· 145 95 vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); 146 96 write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); 147 97 148 - if (cpus_have_final_cap(ARM64_SME)) { 149 - sysreg_clear_set_s(SYS_HFGRTR_EL2, 150 - HFGxTR_EL2_nSMPRI_EL1_MASK | 151 - HFGxTR_EL2_nTPIDR2_EL0_MASK, 152 - 0); 153 - sysreg_clear_set_s(SYS_HFGWTR_EL2, 154 - HFGxTR_EL2_nSMPRI_EL1_MASK | 155 - HFGxTR_EL2_nTPIDR2_EL0_MASK, 156 - 0); 
157 - } 98 + if (__hfgxtr_traps_required()) 99 + __activate_traps_hfgxtr(); 158 100 } 159 101 160 102 static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) ··· 162 120 vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); 163 121 } 164 122 165 - if (cpus_have_final_cap(ARM64_SME)) { 166 - sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, 167 - HFGxTR_EL2_nSMPRI_EL1_MASK | 168 - HFGxTR_EL2_nTPIDR2_EL0_MASK); 169 - sysreg_clear_set_s(SYS_HFGWTR_EL2, 0, 170 - HFGxTR_EL2_nSMPRI_EL1_MASK | 171 - HFGxTR_EL2_nTPIDR2_EL0_MASK); 172 - } 123 + if (__hfgxtr_traps_required()) 124 + __deactivate_traps_hfgxtr(); 173 125 } 174 126 175 127 static inline void ___activate_traps(struct kvm_vcpu *vcpu) ··· 239 203 /* Valid trap. Switch the context: */ 240 204 241 205 /* First disable enough traps to allow us to update the registers */ 242 - if (has_vhe()) { 206 + if (has_vhe() || has_hvhe()) { 243 207 reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; 244 208 if (sve_guest) 245 209 reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; ··· 431 395 return true; 432 396 } 433 397 398 + static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) 399 + { 400 + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); 401 + int rt = kvm_vcpu_sys_get_rt(vcpu); 402 + u64 val = vcpu_get_reg(vcpu, rt); 403 + 404 + if (sysreg != SYS_TCR_EL1) 405 + return false; 406 + 407 + /* 408 + * Affected parts do not advertise support for hardware Access Flag / 409 + * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying 410 + * control bits are still functional. The architecture requires these be 411 + * RES0 on systems that do not implement FEAT_HAFDBS. 412 + * 413 + * Uphold the requirements of the architecture by masking guest writes 414 + * to TCR_EL1.{HA,HD} here. 
415 + */ 416 + val &= ~(TCR_HD | TCR_HA); 417 + write_sysreg_el1(val, SYS_TCR); 418 + return true; 419 + } 420 + 434 421 static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) 435 422 { 436 423 if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && 437 424 handle_tx2_tvm(vcpu)) 425 + return true; 426 + 427 + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) && 428 + handle_ampere1_tcr(vcpu)) 438 429 return true; 439 430 440 431 if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&

+17

arch/arm64/kvm/hyp/include/nvhe/ffa.h

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 - Google LLC
 * Author: Andrew Walbran <qwandor@google.com>
 */
#ifndef __KVM_HYP_FFA_H
#define __KVM_HYP_FFA_H

#include <asm/kvm_host.h>

/*
 * Inclusive range of SMCCC function numbers that the hypervisor treats as
 * belonging to FF-A when deciding whether to intercept a host SMC.
 */
#define FFA_MIN_FUNC_NUM 0x60
#define FFA_MAX_FUNC_NUM 0x7F

/*
 * Set up the EL2 FF-A proxy. @pages is the memory from which the proxy
 * carves its own TX/RX mailboxes and its descriptor buffer. Returns 0 on
 * success (also when FF-A is not available and the proxy stays inactive) or
 * a negative errno.
 */
int hyp_ffa_init(void *pages);

/*
 * Handle a host SMC. Returns true when the call was dealt with at EL2 and
 * the result has been written back to @host_ctxt; returns false when the
 * caller should carry on with its default handling of the SMC.
 */
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt);

#endif /* __KVM_HYP_FFA_H */

+3

arch/arm64/kvm/hyp/include/nvhe/mem_protect.h

··· 57 57 enum pkvm_component_id { 58 58 PKVM_ID_HOST, 59 59 PKVM_ID_HYP, 60 + PKVM_ID_FFA, 60 61 }; 61 62 62 63 extern unsigned long hyp_nr_cpus; ··· 67 66 int __pkvm_host_unshare_hyp(u64 pfn); 68 67 int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); 69 68 int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); 69 + int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); 70 + int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); 70 71 71 72 bool addr_is_memory(phys_addr_t phys); 72 73 int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);

+1 -1

arch/arm64/kvm/hyp/nvhe/Makefile

··· 22 22 23 23 hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ 24 24 hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ 25 - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o 25 + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o 26 26 hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ 27 27 ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o 28 28 hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o

+762

arch/arm64/kvm/hyp/nvhe/ffa.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by 4 + * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware 5 + * Framework for Arm A-profile", which is specified by Arm in document 6 + * number DEN0077. 7 + * 8 + * Copyright (C) 2022 - Google LLC 9 + * Author: Andrew Walbran <qwandor@google.com> 10 + * 11 + * This driver hooks into the SMC trapping logic for the host and intercepts 12 + * all calls falling within the FF-A range. Each call is either: 13 + * 14 + * - Forwarded on unmodified to the SPMD at EL3 15 + * - Rejected as "unsupported" 16 + * - Accompanied by a host stage-2 page-table check/update and reissued 17 + * 18 + * Consequently, any attempts by the host to make guest memory pages 19 + * accessible to the secure world using FF-A will be detected either here 20 + * (in the case that the memory is already owned by the guest) or during 21 + * donation to the guest (in the case that the memory was previously shared 22 + * with the secure world). 23 + * 24 + * To allow the rolling-back of page-table updates and FF-A calls in the 25 + * event of failure, operations involving the RXTX buffers are locked for 26 + * the duration and are therefore serialised. 27 + */ 28 + 29 + #include <linux/arm-smccc.h> 30 + #include <linux/arm_ffa.h> 31 + #include <asm/kvm_pkvm.h> 32 + 33 + #include <nvhe/ffa.h> 34 + #include <nvhe/mem_protect.h> 35 + #include <nvhe/memory.h> 36 + #include <nvhe/trap_handler.h> 37 + #include <nvhe/spinlock.h> 38 + 39 + /* 40 + * "ID value 0 must be returned at the Non-secure physical FF-A instance" 41 + * We share this ID with the host. 42 + */ 43 + #define HOST_FFA_ID 0 44 + 45 + /* 46 + * A buffer to hold the maximum descriptor size we can see from the host, 47 + * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP 48 + * when resolving the handle on the reclaim path. 
49 + */ 50 + struct kvm_ffa_descriptor_buffer { 51 + void *buf; 52 + size_t len; 53 + }; 54 + 55 + static struct kvm_ffa_descriptor_buffer ffa_desc_buf; 56 + 57 + struct kvm_ffa_buffers { 58 + hyp_spinlock_t lock; 59 + void *tx; 60 + void *rx; 61 + }; 62 + 63 + /* 64 + * Note that we don't currently lock these buffers explicitly, instead 65 + * relying on the locking of the host FFA buffers as we only have one 66 + * client. 67 + */ 68 + static struct kvm_ffa_buffers hyp_buffers; 69 + static struct kvm_ffa_buffers host_buffers; 70 + 71 + static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) 72 + { 73 + *res = (struct arm_smccc_res) { 74 + .a0 = FFA_ERROR, 75 + .a2 = ffa_errno, 76 + }; 77 + } 78 + 79 + static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) 80 + { 81 + if (ret == FFA_RET_SUCCESS) { 82 + *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, 83 + .a2 = prop }; 84 + } else { 85 + ffa_to_smccc_error(res, ret); 86 + } 87 + } 88 + 89 + static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) 90 + { 91 + ffa_to_smccc_res_prop(res, ret, 0); 92 + } 93 + 94 + static void ffa_set_retval(struct kvm_cpu_context *ctxt, 95 + struct arm_smccc_res *res) 96 + { 97 + cpu_reg(ctxt, 0) = res->a0; 98 + cpu_reg(ctxt, 1) = res->a1; 99 + cpu_reg(ctxt, 2) = res->a2; 100 + cpu_reg(ctxt, 3) = res->a3; 101 + } 102 + 103 + static bool is_ffa_call(u64 func_id) 104 + { 105 + return ARM_SMCCC_IS_FAST_CALL(func_id) && 106 + ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && 107 + ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM && 108 + ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; 109 + } 110 + 111 + static int ffa_map_hyp_buffers(u64 ffa_page_count) 112 + { 113 + struct arm_smccc_res res; 114 + 115 + arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, 116 + hyp_virt_to_phys(hyp_buffers.tx), 117 + hyp_virt_to_phys(hyp_buffers.rx), 118 + ffa_page_count, 119 + 0, 0, 0, 0, 120 + &res); 121 + 122 + return res.a0 == FFA_SUCCESS ? 
FFA_RET_SUCCESS : res.a2; 123 + } 124 + 125 + static int ffa_unmap_hyp_buffers(void) 126 + { 127 + struct arm_smccc_res res; 128 + 129 + arm_smccc_1_1_smc(FFA_RXTX_UNMAP, 130 + HOST_FFA_ID, 131 + 0, 0, 0, 0, 0, 0, 132 + &res); 133 + 134 + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; 135 + } 136 + 137 + static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, 138 + u32 handle_hi, u32 fraglen, u32 endpoint_id) 139 + { 140 + arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, 141 + handle_lo, handle_hi, fraglen, endpoint_id, 142 + 0, 0, 0, 143 + res); 144 + } 145 + 146 + static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, 147 + u32 handle_hi, u32 fragoff) 148 + { 149 + arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, 150 + handle_lo, handle_hi, fragoff, HOST_FFA_ID, 151 + 0, 0, 0, 152 + res); 153 + } 154 + 155 + static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, 156 + u32 fraglen) 157 + { 158 + arm_smccc_1_1_smc(func_id, len, fraglen, 159 + 0, 0, 0, 0, 0, 160 + res); 161 + } 162 + 163 + static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, 164 + u32 handle_hi, u32 flags) 165 + { 166 + arm_smccc_1_1_smc(FFA_MEM_RECLAIM, 167 + handle_lo, handle_hi, flags, 168 + 0, 0, 0, 0, 169 + res); 170 + } 171 + 172 + static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) 173 + { 174 + arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, 175 + len, len, 176 + 0, 0, 0, 0, 0, 177 + res); 178 + } 179 + 180 + static void do_ffa_rxtx_map(struct arm_smccc_res *res, 181 + struct kvm_cpu_context *ctxt) 182 + { 183 + DECLARE_REG(phys_addr_t, tx, ctxt, 1); 184 + DECLARE_REG(phys_addr_t, rx, ctxt, 2); 185 + DECLARE_REG(u32, npages, ctxt, 3); 186 + int ret = 0; 187 + void *rx_virt, *tx_virt; 188 + 189 + if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) { 190 + ret = FFA_RET_INVALID_PARAMETERS; 191 + goto out; 192 + } 193 + 194 + if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) { 195 + ret = FFA_RET_INVALID_PARAMETERS; 196 + goto 
out; 197 + } 198 + 199 + hyp_spin_lock(&host_buffers.lock); 200 + if (host_buffers.tx) { 201 + ret = FFA_RET_DENIED; 202 + goto out_unlock; 203 + } 204 + 205 + /* 206 + * Map our hypervisor buffers into the SPMD before mapping and 207 + * pinning the host buffers in our own address space. 208 + */ 209 + ret = ffa_map_hyp_buffers(npages); 210 + if (ret) 211 + goto out_unlock; 212 + 213 + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx)); 214 + if (ret) { 215 + ret = FFA_RET_INVALID_PARAMETERS; 216 + goto err_unmap; 217 + } 218 + 219 + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx)); 220 + if (ret) { 221 + ret = FFA_RET_INVALID_PARAMETERS; 222 + goto err_unshare_tx; 223 + } 224 + 225 + tx_virt = hyp_phys_to_virt(tx); 226 + ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1); 227 + if (ret) { 228 + ret = FFA_RET_INVALID_PARAMETERS; 229 + goto err_unshare_rx; 230 + } 231 + 232 + rx_virt = hyp_phys_to_virt(rx); 233 + ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1); 234 + if (ret) { 235 + ret = FFA_RET_INVALID_PARAMETERS; 236 + goto err_unpin_tx; 237 + } 238 + 239 + host_buffers.tx = tx_virt; 240 + host_buffers.rx = rx_virt; 241 + 242 + out_unlock: 243 + hyp_spin_unlock(&host_buffers.lock); 244 + out: 245 + ffa_to_smccc_res(res, ret); 246 + return; 247 + 248 + err_unpin_tx: 249 + hyp_unpin_shared_mem(tx_virt, tx_virt + 1); 250 + err_unshare_rx: 251 + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx)); 252 + err_unshare_tx: 253 + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx)); 254 + err_unmap: 255 + ffa_unmap_hyp_buffers(); 256 + goto out_unlock; 257 + } 258 + 259 + static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, 260 + struct kvm_cpu_context *ctxt) 261 + { 262 + DECLARE_REG(u32, id, ctxt, 1); 263 + int ret = 0; 264 + 265 + if (id != HOST_FFA_ID) { 266 + ret = FFA_RET_INVALID_PARAMETERS; 267 + goto out; 268 + } 269 + 270 + hyp_spin_lock(&host_buffers.lock); 271 + if (!host_buffers.tx) { 272 + ret = FFA_RET_INVALID_PARAMETERS; 273 + goto out_unlock; 274 + } 275 + 276 + 
hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1); 277 + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx))); 278 + host_buffers.tx = NULL; 279 + 280 + hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1); 281 + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx))); 282 + host_buffers.rx = NULL; 283 + 284 + ffa_unmap_hyp_buffers(); 285 + 286 + out_unlock: 287 + hyp_spin_unlock(&host_buffers.lock); 288 + out: 289 + ffa_to_smccc_res(res, ret); 290 + } 291 + 292 + static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, 293 + u32 nranges) 294 + { 295 + u32 i; 296 + 297 + for (i = 0; i < nranges; ++i) { 298 + struct ffa_mem_region_addr_range *range = &ranges[i]; 299 + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; 300 + u64 pfn = hyp_phys_to_pfn(range->address); 301 + 302 + if (!PAGE_ALIGNED(sz)) 303 + break; 304 + 305 + if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE)) 306 + break; 307 + } 308 + 309 + return i; 310 + } 311 + 312 + static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, 313 + u32 nranges) 314 + { 315 + u32 i; 316 + 317 + for (i = 0; i < nranges; ++i) { 318 + struct ffa_mem_region_addr_range *range = &ranges[i]; 319 + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; 320 + u64 pfn = hyp_phys_to_pfn(range->address); 321 + 322 + if (!PAGE_ALIGNED(sz)) 323 + break; 324 + 325 + if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE)) 326 + break; 327 + } 328 + 329 + return i; 330 + } 331 + 332 + static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, 333 + u32 nranges) 334 + { 335 + u32 nshared = __ffa_host_share_ranges(ranges, nranges); 336 + int ret = 0; 337 + 338 + if (nshared != nranges) { 339 + WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared); 340 + ret = FFA_RET_DENIED; 341 + } 342 + 343 + return ret; 344 + } 345 + 346 + static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, 347 + u32 nranges) 348 + { 349 + u32 nunshared = 
__ffa_host_unshare_ranges(ranges, nranges); 350 + int ret = 0; 351 + 352 + if (nunshared != nranges) { 353 + WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared); 354 + ret = FFA_RET_DENIED; 355 + } 356 + 357 + return ret; 358 + } 359 + 360 + static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, 361 + struct kvm_cpu_context *ctxt) 362 + { 363 + DECLARE_REG(u32, handle_lo, ctxt, 1); 364 + DECLARE_REG(u32, handle_hi, ctxt, 2); 365 + DECLARE_REG(u32, fraglen, ctxt, 3); 366 + DECLARE_REG(u32, endpoint_id, ctxt, 4); 367 + struct ffa_mem_region_addr_range *buf; 368 + int ret = FFA_RET_INVALID_PARAMETERS; 369 + u32 nr_ranges; 370 + 371 + if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) 372 + goto out; 373 + 374 + if (fraglen % sizeof(*buf)) 375 + goto out; 376 + 377 + hyp_spin_lock(&host_buffers.lock); 378 + if (!host_buffers.tx) 379 + goto out_unlock; 380 + 381 + buf = hyp_buffers.tx; 382 + memcpy(buf, host_buffers.tx, fraglen); 383 + nr_ranges = fraglen / sizeof(*buf); 384 + 385 + ret = ffa_host_share_ranges(buf, nr_ranges); 386 + if (ret) { 387 + /* 388 + * We're effectively aborting the transaction, so we need 389 + * to restore the global state back to what it was prior to 390 + * transmission of the first fragment. 391 + */ 392 + ffa_mem_reclaim(res, handle_lo, handle_hi, 0); 393 + WARN_ON(res->a0 != FFA_SUCCESS); 394 + goto out_unlock; 395 + } 396 + 397 + ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id); 398 + if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX) 399 + WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges)); 400 + 401 + out_unlock: 402 + hyp_spin_unlock(&host_buffers.lock); 403 + out: 404 + if (ret) 405 + ffa_to_smccc_res(res, ret); 406 + 407 + /* 408 + * If for any reason this did not succeed, we're in trouble as we have 409 + * now lost the content of the previous fragments and we can't rollback 410 + * the host stage-2 changes. 
The pages previously marked as shared will 411 + * remain stuck in that state forever, hence preventing the host from 412 + * sharing/donating them again and may possibly lead to subsequent 413 + * failures, but this will not compromise confidentiality. 414 + */ 415 + return; 416 + } 417 + 418 + static __always_inline void do_ffa_mem_xfer(const u64 func_id, 419 + struct arm_smccc_res *res, 420 + struct kvm_cpu_context *ctxt) 421 + { 422 + DECLARE_REG(u32, len, ctxt, 1); 423 + DECLARE_REG(u32, fraglen, ctxt, 2); 424 + DECLARE_REG(u64, addr_mbz, ctxt, 3); 425 + DECLARE_REG(u32, npages_mbz, ctxt, 4); 426 + struct ffa_composite_mem_region *reg; 427 + struct ffa_mem_region *buf; 428 + u32 offset, nr_ranges; 429 + int ret = 0; 430 + 431 + BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && 432 + func_id != FFA_FN64_MEM_LEND); 433 + 434 + if (addr_mbz || npages_mbz || fraglen > len || 435 + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { 436 + ret = FFA_RET_INVALID_PARAMETERS; 437 + goto out; 438 + } 439 + 440 + if (fraglen < sizeof(struct ffa_mem_region) + 441 + sizeof(struct ffa_mem_region_attributes)) { 442 + ret = FFA_RET_INVALID_PARAMETERS; 443 + goto out; 444 + } 445 + 446 + hyp_spin_lock(&host_buffers.lock); 447 + if (!host_buffers.tx) { 448 + ret = FFA_RET_INVALID_PARAMETERS; 449 + goto out_unlock; 450 + } 451 + 452 + buf = hyp_buffers.tx; 453 + memcpy(buf, host_buffers.tx, fraglen); 454 + 455 + offset = buf->ep_mem_access[0].composite_off; 456 + if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { 457 + ret = FFA_RET_INVALID_PARAMETERS; 458 + goto out_unlock; 459 + } 460 + 461 + if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { 462 + ret = FFA_RET_INVALID_PARAMETERS; 463 + goto out_unlock; 464 + } 465 + 466 + reg = (void *)buf + offset; 467 + nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents; 468 + if (nr_ranges % sizeof(reg->constituents[0])) { 469 + ret = FFA_RET_INVALID_PARAMETERS; 470 + goto out_unlock; 471 + } 472 + 
473 + nr_ranges /= sizeof(reg->constituents[0]); 474 + ret = ffa_host_share_ranges(reg->constituents, nr_ranges); 475 + if (ret) 476 + goto out_unlock; 477 + 478 + ffa_mem_xfer(res, func_id, len, fraglen); 479 + if (fraglen != len) { 480 + if (res->a0 != FFA_MEM_FRAG_RX) 481 + goto err_unshare; 482 + 483 + if (res->a3 != fraglen) 484 + goto err_unshare; 485 + } else if (res->a0 != FFA_SUCCESS) { 486 + goto err_unshare; 487 + } 488 + 489 + out_unlock: 490 + hyp_spin_unlock(&host_buffers.lock); 491 + out: 492 + if (ret) 493 + ffa_to_smccc_res(res, ret); 494 + return; 495 + 496 + err_unshare: 497 + WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges)); 498 + goto out_unlock; 499 + } 500 + 501 + static void do_ffa_mem_reclaim(struct arm_smccc_res *res, 502 + struct kvm_cpu_context *ctxt) 503 + { 504 + DECLARE_REG(u32, handle_lo, ctxt, 1); 505 + DECLARE_REG(u32, handle_hi, ctxt, 2); 506 + DECLARE_REG(u32, flags, ctxt, 3); 507 + struct ffa_composite_mem_region *reg; 508 + u32 offset, len, fraglen, fragoff; 509 + struct ffa_mem_region *buf; 510 + int ret = 0; 511 + u64 handle; 512 + 513 + handle = PACK_HANDLE(handle_lo, handle_hi); 514 + 515 + hyp_spin_lock(&host_buffers.lock); 516 + 517 + buf = hyp_buffers.tx; 518 + *buf = (struct ffa_mem_region) { 519 + .sender_id = HOST_FFA_ID, 520 + .handle = handle, 521 + }; 522 + 523 + ffa_retrieve_req(res, sizeof(*buf)); 524 + buf = hyp_buffers.rx; 525 + if (res->a0 != FFA_MEM_RETRIEVE_RESP) 526 + goto out_unlock; 527 + 528 + len = res->a1; 529 + fraglen = res->a2; 530 + 531 + offset = buf->ep_mem_access[0].composite_off; 532 + /* 533 + * We can trust the SPMD to get this right, but let's at least 534 + * check that we end up with something that doesn't look _completely_ 535 + * bogus. 
536 + */ 537 + if (WARN_ON(offset > len || 538 + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { 539 + ret = FFA_RET_ABORTED; 540 + goto out_unlock; 541 + } 542 + 543 + if (len > ffa_desc_buf.len) { 544 + ret = FFA_RET_NO_MEMORY; 545 + goto out_unlock; 546 + } 547 + 548 + buf = ffa_desc_buf.buf; 549 + memcpy(buf, hyp_buffers.rx, fraglen); 550 + 551 + for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { 552 + ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); 553 + if (res->a0 != FFA_MEM_FRAG_TX) { 554 + ret = FFA_RET_INVALID_PARAMETERS; 555 + goto out_unlock; 556 + } 557 + 558 + fraglen = res->a3; 559 + memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); 560 + } 561 + 562 + ffa_mem_reclaim(res, handle_lo, handle_hi, flags); 563 + if (res->a0 != FFA_SUCCESS) 564 + goto out_unlock; 565 + 566 + reg = (void *)buf + offset; 567 + /* If the SPMD was happy, then we should be too. */ 568 + WARN_ON(ffa_host_unshare_ranges(reg->constituents, 569 + reg->addr_range_cnt)); 570 + out_unlock: 571 + hyp_spin_unlock(&host_buffers.lock); 572 + 573 + if (ret) 574 + ffa_to_smccc_res(res, ret); 575 + } 576 + 577 + /* 578 + * Is a given FFA function supported, either by forwarding on directly 579 + * or by handling at EL2? 
580 + */ 581 + static bool ffa_call_supported(u64 func_id) 582 + { 583 + switch (func_id) { 584 + /* Unsupported memory management calls */ 585 + case FFA_FN64_MEM_RETRIEVE_REQ: 586 + case FFA_MEM_RETRIEVE_RESP: 587 + case FFA_MEM_RELINQUISH: 588 + case FFA_MEM_OP_PAUSE: 589 + case FFA_MEM_OP_RESUME: 590 + case FFA_MEM_FRAG_RX: 591 + case FFA_FN64_MEM_DONATE: 592 + /* Indirect message passing via RX/TX buffers */ 593 + case FFA_MSG_SEND: 594 + case FFA_MSG_POLL: 595 + case FFA_MSG_WAIT: 596 + /* 32-bit variants of 64-bit calls */ 597 + case FFA_MSG_SEND_DIRECT_REQ: 598 + case FFA_MSG_SEND_DIRECT_RESP: 599 + case FFA_RXTX_MAP: 600 + case FFA_MEM_DONATE: 601 + case FFA_MEM_RETRIEVE_REQ: 602 + return false; 603 + } 604 + 605 + return true; 606 + } 607 + 608 + static bool do_ffa_features(struct arm_smccc_res *res, 609 + struct kvm_cpu_context *ctxt) 610 + { 611 + DECLARE_REG(u32, id, ctxt, 1); 612 + u64 prop = 0; 613 + int ret = 0; 614 + 615 + if (!ffa_call_supported(id)) { 616 + ret = FFA_RET_NOT_SUPPORTED; 617 + goto out_handled; 618 + } 619 + 620 + switch (id) { 621 + case FFA_MEM_SHARE: 622 + case FFA_FN64_MEM_SHARE: 623 + case FFA_MEM_LEND: 624 + case FFA_FN64_MEM_LEND: 625 + ret = FFA_RET_SUCCESS; 626 + prop = 0; /* No support for dynamic buffers */ 627 + goto out_handled; 628 + default: 629 + return false; 630 + } 631 + 632 + out_handled: 633 + ffa_to_smccc_res_prop(res, ret, prop); 634 + return true; 635 + } 636 + 637 + bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) 638 + { 639 + DECLARE_REG(u64, func_id, host_ctxt, 0); 640 + struct arm_smccc_res res; 641 + 642 + /* 643 + * There's no way we can tell what a non-standard SMC call might 644 + * be up to. Ideally, we would terminate these here and return 645 + * an error to the host, but sadly devices make use of custom 646 + * firmware calls for things like power management, debugging, 647 + * RNG access and crash reporting. 
648 + * 649 + * Given that the architecture requires us to trust EL3 anyway, 650 + * we forward unrecognised calls on under the assumption that 651 + * the firmware doesn't expose a mechanism to access arbitrary 652 + * non-secure memory. Short of a per-device table of SMCs, this 653 + * is the best we can do. 654 + */ 655 + if (!is_ffa_call(func_id)) 656 + return false; 657 + 658 + switch (func_id) { 659 + case FFA_FEATURES: 660 + if (!do_ffa_features(&res, host_ctxt)) 661 + return false; 662 + goto out_handled; 663 + /* Memory management */ 664 + case FFA_FN64_RXTX_MAP: 665 + do_ffa_rxtx_map(&res, host_ctxt); 666 + goto out_handled; 667 + case FFA_RXTX_UNMAP: 668 + do_ffa_rxtx_unmap(&res, host_ctxt); 669 + goto out_handled; 670 + case FFA_MEM_SHARE: 671 + case FFA_FN64_MEM_SHARE: 672 + do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt); 673 + goto out_handled; 674 + case FFA_MEM_RECLAIM: 675 + do_ffa_mem_reclaim(&res, host_ctxt); 676 + goto out_handled; 677 + case FFA_MEM_LEND: 678 + case FFA_FN64_MEM_LEND: 679 + do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); 680 + goto out_handled; 681 + case FFA_MEM_FRAG_TX: 682 + do_ffa_mem_frag_tx(&res, host_ctxt); 683 + goto out_handled; 684 + } 685 + 686 + if (ffa_call_supported(func_id)) 687 + return false; /* Pass through */ 688 + 689 + ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); 690 + out_handled: 691 + ffa_set_retval(host_ctxt, &res); 692 + return true; 693 + } 694 + 695 + int hyp_ffa_init(void *pages) 696 + { 697 + struct arm_smccc_res res; 698 + size_t min_rxtx_sz; 699 + void *tx, *rx; 700 + 701 + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) 702 + return 0; 703 + 704 + arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res); 705 + if (res.a0 == FFA_RET_NOT_SUPPORTED) 706 + return 0; 707 + 708 + if (res.a0 != FFA_VERSION_1_0) 709 + return -EOPNOTSUPP; 710 + 711 + arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); 712 + if (res.a0 != FFA_SUCCESS) 713 + return 
-EOPNOTSUPP; 714 + 715 + if (res.a2 != HOST_FFA_ID) 716 + return -EINVAL; 717 + 718 + arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, 719 + 0, 0, 0, 0, 0, 0, &res); 720 + if (res.a0 != FFA_SUCCESS) 721 + return -EOPNOTSUPP; 722 + 723 + switch (res.a2) { 724 + case FFA_FEAT_RXTX_MIN_SZ_4K: 725 + min_rxtx_sz = SZ_4K; 726 + break; 727 + case FFA_FEAT_RXTX_MIN_SZ_16K: 728 + min_rxtx_sz = SZ_16K; 729 + break; 730 + case FFA_FEAT_RXTX_MIN_SZ_64K: 731 + min_rxtx_sz = SZ_64K; 732 + break; 733 + default: 734 + return -EINVAL; 735 + } 736 + 737 + if (min_rxtx_sz > PAGE_SIZE) 738 + return -EOPNOTSUPP; 739 + 740 + tx = pages; 741 + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; 742 + rx = pages; 743 + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; 744 + 745 + ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) { 746 + .buf = pages, 747 + .len = PAGE_SIZE * 748 + (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)), 749 + }; 750 + 751 + hyp_buffers = (struct kvm_ffa_buffers) { 752 + .lock = __HYP_SPIN_LOCK_UNLOCKED, 753 + .tx = tx, 754 + .rx = rx, 755 + }; 756 + 757 + host_buffers = (struct kvm_ffa_buffers) { 758 + .lock = __HYP_SPIN_LOCK_UNLOCKED, 759 + }; 760 + 761 + return 0; 762 + }

+35 -1

arch/arm64/kvm/hyp/nvhe/host.S

··· 10 10 #include <asm/kvm_arm.h> 11 11 #include <asm/kvm_asm.h> 12 12 #include <asm/kvm_mmu.h> 13 + #include <asm/kvm_ptrauth.h> 13 14 14 15 .text 15 16 ··· 38 37 39 38 /* Save the host context pointer in x29 across the function call */ 40 39 mov x29, x0 40 + 41 + #ifdef CONFIG_ARM64_PTR_AUTH_KERNEL 42 + alternative_if_not ARM64_HAS_ADDRESS_AUTH 43 + b __skip_pauth_save 44 + alternative_else_nop_endif 45 + 46 + alternative_if ARM64_KVM_PROTECTED_MODE 47 + /* Save kernel ptrauth keys. */ 48 + add x18, x29, #CPU_APIAKEYLO_EL1 49 + ptrauth_save_state x18, x19, x20 50 + 51 + /* Use hyp keys. */ 52 + adr_this_cpu x18, kvm_hyp_ctxt, x19 53 + add x18, x18, #CPU_APIAKEYLO_EL1 54 + ptrauth_restore_state x18, x19, x20 55 + isb 56 + alternative_else_nop_endif 57 + __skip_pauth_save: 58 + #endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ 59 + 41 60 bl handle_trap 42 61 43 - /* Restore host regs x0-x17 */ 44 62 __host_enter_restore_full: 63 + /* Restore kernel keys. */ 64 + #ifdef CONFIG_ARM64_PTR_AUTH_KERNEL 65 + alternative_if_not ARM64_HAS_ADDRESS_AUTH 66 + b __skip_pauth_restore 67 + alternative_else_nop_endif 68 + 69 + alternative_if ARM64_KVM_PROTECTED_MODE 70 + add x18, x29, #CPU_APIAKEYLO_EL1 71 + ptrauth_restore_state x18, x19, x20 72 + alternative_else_nop_endif 73 + __skip_pauth_restore: 74 + #endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ 75 + 76 + /* Restore host regs x0-x17 */ 45 77 ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] 46 78 ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] 47 79 ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]

+29 -3

arch/arm64/kvm/hyp/nvhe/hyp-init.S

··· 83 83 * x0: struct kvm_nvhe_init_params PA 84 84 */ 85 85 SYM_CODE_START_LOCAL(___kvm_hyp_init) 86 - ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] 87 - msr tpidr_el2, x1 88 - 89 86 ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA] 90 87 mov sp, x1 91 88 ··· 91 94 92 95 ldr x1, [x0, #NVHE_INIT_HCR_EL2] 93 96 msr hcr_el2, x1 97 + 98 + mov x2, #HCR_E2H 99 + and x2, x1, x2 100 + cbz x2, 1f 101 + 102 + // hVHE: Replay the EL2 setup to account for the E2H bit 103 + // TPIDR_EL2 is used to preserve x0 across the macro maze... 104 + isb 105 + msr tpidr_el2, x0 106 + init_el2_state 107 + finalise_el2_state 108 + mrs x0, tpidr_el2 109 + 110 + 1: 111 + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] 112 + msr tpidr_el2, x1 94 113 95 114 ldr x1, [x0, #NVHE_INIT_VTTBR] 96 115 msr vttbr_el2, x1 ··· 141 128 SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) 142 129 orr x0, x0, x1 143 130 alternative_else_nop_endif 131 + 132 + #ifdef CONFIG_ARM64_BTI_KERNEL 133 + alternative_if ARM64_BTI 134 + orr x0, x0, #SCTLR_EL2_BT 135 + alternative_else_nop_endif 136 + #endif /* CONFIG_ARM64_BTI_KERNEL */ 137 + 144 138 msr sctlr_el2, x0 145 139 isb 146 140 ··· 204 184 /* Initialize EL2 CPU state to sane values. */ 205 185 init_el2_state // Clobbers x0..x2 206 186 finalise_el2_state 187 + __init_el2_nvhe_prepare_eret 207 188 208 189 /* Enable MMU, set vectors and stack. */ 209 190 mov x0, x28 ··· 217 196 SYM_CODE_END(__kvm_hyp_init_cpu) 218 197 219 198 SYM_CODE_START(__kvm_handle_stub_hvc) 199 + /* 200 + * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so 201 + * we need bti j at beginning. 202 + */ 203 + bti j 220 204 cmp x0, #HVC_SOFT_RESTART 221 205 b.ne 1f 222 206

+18 -1

arch/arm64/kvm/hyp/nvhe/hyp-main.c

··· 13 13 #include <asm/kvm_hyp.h> 14 14 #include <asm/kvm_mmu.h> 15 15 16 + #include <nvhe/ffa.h> 16 17 #include <nvhe/mem_protect.h> 17 18 #include <nvhe/mm.h> 18 19 #include <nvhe/pkvm.h> ··· 124 123 DECLARE_REG(int, level, host_ctxt, 3); 125 124 126 125 __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); 126 + } 127 + 128 + static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt) 129 + { 130 + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); 131 + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); 132 + DECLARE_REG(int, level, host_ctxt, 3); 133 + 134 + __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level); 127 135 } 128 136 129 137 static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) ··· 325 315 HANDLE_FUNC(__kvm_vcpu_run), 326 316 HANDLE_FUNC(__kvm_flush_vm_context), 327 317 HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), 318 + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh), 328 319 HANDLE_FUNC(__kvm_tlb_flush_vmid), 329 320 HANDLE_FUNC(__kvm_flush_cpu_context), 330 321 HANDLE_FUNC(__kvm_timer_set_cntvoff), ··· 385 374 386 375 handled = kvm_host_psci_handler(host_ctxt); 387 376 if (!handled) 377 + handled = kvm_host_ffa_handler(host_ctxt); 378 + if (!handled) 388 379 default_host_smc_handler(host_ctxt); 389 380 390 381 /* SMC was trapped, move ELR past the current PC. */ ··· 405 392 handle_host_smc(host_ctxt); 406 393 break; 407 394 case ESR_ELx_EC_SVE: 408 - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); 395 + if (has_hvhe()) 396 + sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN | 397 + CPACR_EL1_ZEN_EL0EN)); 398 + else 399 + sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); 409 400 isb(); 410 401 sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); 411 402 break;

+71 -3

arch/arm64/kvm/hyp/nvhe/mem_protect.c

··· 91 91 hyp_put_page(&host_s2_pool, addr); 92 92 } 93 93 94 - static void host_s2_free_removed_table(void *addr, u32 level) 94 + static void host_s2_free_unlinked_table(void *addr, u32 level) 95 95 { 96 - kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level); 96 + kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); 97 97 } 98 98 99 99 static int prepare_s2_pool(void *pgt_pool_base) ··· 110 110 host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { 111 111 .zalloc_pages_exact = host_s2_zalloc_pages_exact, 112 112 .zalloc_page = host_s2_zalloc_page, 113 - .free_removed_table = host_s2_free_removed_table, 113 + .free_unlinked_table = host_s2_free_unlinked_table, 114 114 .phys_to_virt = hyp_phys_to_virt, 115 115 .virt_to_phys = hyp_virt_to_phys, 116 116 .page_count = hyp_page_count, ··· 842 842 case PKVM_ID_HYP: 843 843 ret = hyp_ack_share(completer_addr, tx, share->completer_prot); 844 844 break; 845 + case PKVM_ID_FFA: 846 + /* 847 + * We only check the host; the secure side will check the other 848 + * end when we forward the FFA call. 849 + */ 850 + ret = 0; 851 + break; 845 852 default: 846 853 ret = -EINVAL; 847 854 } ··· 876 869 switch (tx->completer.id) { 877 870 case PKVM_ID_HYP: 878 871 ret = hyp_complete_share(completer_addr, tx, share->completer_prot); 872 + break; 873 + case PKVM_ID_FFA: 874 + /* 875 + * We're not responsible for any secure page-tables, so there's 876 + * nothing to do here. 
877 + */ 878 + ret = 0; 879 879 break; 880 880 default: 881 881 ret = -EINVAL; ··· 932 918 case PKVM_ID_HYP: 933 919 ret = hyp_ack_unshare(completer_addr, tx); 934 920 break; 921 + case PKVM_ID_FFA: 922 + /* See check_share() */ 923 + ret = 0; 924 + break; 935 925 default: 936 926 ret = -EINVAL; 937 927 } ··· 963 945 switch (tx->completer.id) { 964 946 case PKVM_ID_HYP: 965 947 ret = hyp_complete_unshare(completer_addr, tx); 948 + break; 949 + case PKVM_ID_FFA: 950 + /* See __do_share() */ 951 + ret = 0; 966 952 break; 967 953 default: 968 954 ret = -EINVAL; ··· 1256 1234 1257 1235 hyp_unlock_component(); 1258 1236 host_unlock_component(); 1237 + } 1238 + 1239 + int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) 1240 + { 1241 + int ret; 1242 + struct pkvm_mem_share share = { 1243 + .tx = { 1244 + .nr_pages = nr_pages, 1245 + .initiator = { 1246 + .id = PKVM_ID_HOST, 1247 + .addr = hyp_pfn_to_phys(pfn), 1248 + }, 1249 + .completer = { 1250 + .id = PKVM_ID_FFA, 1251 + }, 1252 + }, 1253 + }; 1254 + 1255 + host_lock_component(); 1256 + ret = do_share(&share); 1257 + host_unlock_component(); 1258 + 1259 + return ret; 1260 + } 1261 + 1262 + int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) 1263 + { 1264 + int ret; 1265 + struct pkvm_mem_share share = { 1266 + .tx = { 1267 + .nr_pages = nr_pages, 1268 + .initiator = { 1269 + .id = PKVM_ID_HOST, 1270 + .addr = hyp_pfn_to_phys(pfn), 1271 + }, 1272 + .completer = { 1273 + .id = PKVM_ID_FFA, 1274 + }, 1275 + }, 1276 + }; 1277 + 1278 + host_lock_component(); 1279 + ret = do_unshare(&share); 1280 + host_unlock_component(); 1281 + 1282 + return ret; 1259 1283 }

+21 -6

arch/arm64/kvm/hyp/nvhe/pkvm.c

··· 27 27 u64 hcr_set = HCR_RW; 28 28 u64 hcr_clear = 0; 29 29 u64 cptr_set = 0; 30 + u64 cptr_clear = 0; 30 31 31 32 /* Protected KVM does not support AArch32 guests. */ 32 33 BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ··· 44 43 BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), 45 44 PVM_ID_AA64PFR0_ALLOW)); 46 45 46 + if (has_hvhe()) 47 + hcr_set |= HCR_E2H; 48 + 47 49 /* Trap RAS unless all current versions are supported */ 48 50 if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < 49 51 ID_AA64PFR0_EL1_RAS_V1P1) { ··· 61 57 } 62 58 63 59 /* Trap SVE */ 64 - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) 65 - cptr_set |= CPTR_EL2_TZ; 60 + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { 61 + if (has_hvhe()) 62 + cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; 63 + else 64 + cptr_set |= CPTR_EL2_TZ; 65 + } 66 66 67 67 vcpu->arch.hcr_el2 |= hcr_set; 68 68 vcpu->arch.hcr_el2 &= ~hcr_clear; 69 69 vcpu->arch.cptr_el2 |= cptr_set; 70 + vcpu->arch.cptr_el2 &= ~cptr_clear; 70 71 } 71 72 72 73 /* ··· 129 120 mdcr_set |= MDCR_EL2_TTRF; 130 121 131 122 /* Trap Trace */ 132 - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) 133 - cptr_set |= CPTR_EL2_TTA; 123 + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { 124 + if (has_hvhe()) 125 + cptr_set |= CPACR_EL1_TTA; 126 + else 127 + cptr_set |= CPTR_EL2_TTA; 128 + } 134 129 135 130 vcpu->arch.mdcr_el2 |= mdcr_set; 136 131 vcpu->arch.mdcr_el2 &= ~mdcr_clear; ··· 189 176 /* Clear res0 and set res1 bits to trap potential new features. 
*/ 190 177 vcpu->arch.hcr_el2 &= ~(HCR_RES0); 191 178 vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); 192 - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; 193 - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); 179 + if (!has_hvhe()) { 180 + vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; 181 + vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); 182 + } 194 183 } 195 184 196 185 /*

+11

arch/arm64/kvm/hyp/nvhe/setup.c

··· 11 11 #include <asm/kvm_pkvm.h> 12 12 13 13 #include <nvhe/early_alloc.h> 14 + #include <nvhe/ffa.h> 14 15 #include <nvhe/fixed_config.h> 15 16 #include <nvhe/gfp.h> 16 17 #include <nvhe/memory.h> ··· 29 28 static void *vm_table_base; 30 29 static void *hyp_pgt_base; 31 30 static void *host_s2_pgt_base; 31 + static void *ffa_proxy_pages; 32 32 static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; 33 33 static struct hyp_pool hpool; 34 34 ··· 57 55 nr_pages = host_s2_pgtable_pages(); 58 56 host_s2_pgt_base = hyp_early_alloc_contig(nr_pages); 59 57 if (!host_s2_pgt_base) 58 + return -ENOMEM; 59 + 60 + nr_pages = hyp_ffa_proxy_pages(); 61 + ffa_proxy_pages = hyp_early_alloc_contig(nr_pages); 62 + if (!ffa_proxy_pages) 60 63 return -ENOMEM; 61 64 62 65 return 0; ··· 318 311 goto out; 319 312 320 313 ret = hyp_create_pcpu_fixmap(); 314 + if (ret) 315 + goto out; 316 + 317 + ret = hyp_ffa_init(ffa_proxy_pages); 321 318 if (ret) 322 319 goto out; 323 320

+16 -12

arch/arm64/kvm/hyp/nvhe/switch.c

··· 44 44 __activate_traps_common(vcpu); 45 45 46 46 val = vcpu->arch.cptr_el2; 47 - val |= CPTR_EL2_TTA | CPTR_EL2_TAM; 47 + val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ 48 + val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; 49 + if (cpus_have_final_cap(ARM64_SME)) { 50 + if (has_hvhe()) 51 + val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN); 52 + else 53 + val |= CPTR_EL2_TSM; 54 + } 55 + 48 56 if (!guest_owns_fp_regs(vcpu)) { 49 - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; 57 + if (has_hvhe()) 58 + val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | 59 + CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN); 60 + else 61 + val |= CPTR_EL2_TFP | CPTR_EL2_TZ; 62 + 50 63 __activate_traps_fpsimd32(vcpu); 51 64 } 52 - if (cpus_have_final_cap(ARM64_SME)) 53 - val |= CPTR_EL2_TSM; 54 65 55 66 write_sysreg(val, cptr_el2); 56 67 write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); ··· 84 73 static void __deactivate_traps(struct kvm_vcpu *vcpu) 85 74 { 86 75 extern char __kvm_hyp_host_vector[]; 87 - u64 cptr; 88 76 89 77 ___deactivate_traps(vcpu); 90 78 ··· 108 98 109 99 write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); 110 100 111 - cptr = CPTR_EL2_DEFAULT; 112 - if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) 113 - cptr |= CPTR_EL2_TZ; 114 - if (cpus_have_final_cap(ARM64_SME)) 115 - cptr &= ~CPTR_EL2_TSM; 116 - 117 - write_sysreg(cptr, cptr_el2); 101 + kvm_reset_cptr_el2(vcpu); 118 102 write_sysreg(__kvm_hyp_host_vector, vbar_el2); 119 103 } 120 104

+12 -4

arch/arm64/kvm/hyp/nvhe/timer-sr.c

··· 17 17 } 18 18 19 19 /* 20 - * Should only be called on non-VHE systems. 20 + * Should only be called on non-VHE or hVHE setups. 21 21 * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). 22 22 */ 23 23 void __timer_disable_traps(struct kvm_vcpu *vcpu) 24 24 { 25 - u64 val; 25 + u64 val, shift = 0; 26 + 27 + if (has_hvhe()) 28 + shift = 10; 26 29 27 30 /* Allow physical timer/counter access for the host */ 28 31 val = read_sysreg(cnthctl_el2); 29 - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; 32 + val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; 30 33 write_sysreg(val, cnthctl_el2); 31 34 } 32 35 33 36 /* 34 - * Should only be called on non-VHE systems. 37 + * Should only be called on non-VHE or hVHE setups. 35 38 * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). 36 39 */ 37 40 void __timer_enable_traps(struct kvm_vcpu *vcpu) ··· 52 49 set |= CNTHCTL_EL1PCTEN; 53 50 else 54 51 clr |= CNTHCTL_EL1PCTEN; 52 + 53 + if (has_hvhe()) { 54 + clr <<= 10; 55 + set <<= 10; 56 + } 55 57 56 58 sysreg_clear_set(cnthctl_el2, clr, set); 57 59 }

+52

arch/arm64/kvm/hyp/nvhe/tlb.c

··· 130 130 __tlb_switch_to_host(&cxt); 131 131 } 132 132 133 + void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, 134 + phys_addr_t ipa, int level) 135 + { 136 + struct tlb_inv_context cxt; 137 + 138 + /* Switch to requested VMID */ 139 + __tlb_switch_to_guest(mmu, &cxt, true); 140 + 141 + /* 142 + * We could do so much better if we had the VA as well. 143 + * Instead, we invalidate Stage-2 for this IPA, and the 144 + * whole of Stage-1. Weep... 145 + */ 146 + ipa >>= 12; 147 + __tlbi_level(ipas2e1, ipa, level); 148 + 149 + /* 150 + * We have to ensure completion of the invalidation at Stage-2, 151 + * since a table walk on another CPU could refill a TLB with a 152 + * complete (S1 + S2) walk based on the old Stage-2 mapping if 153 + * the Stage-1 invalidation happened first. 154 + */ 155 + dsb(nsh); 156 + __tlbi(vmalle1); 157 + dsb(nsh); 158 + isb(); 159 + 160 + /* 161 + * If the host is running at EL1 and we have a VPIPT I-cache, 162 + * then we must perform I-cache maintenance at EL2 in order for 163 + * it to have an effect on the guest. Since the guest cannot hit 164 + * I-cache lines allocated with a different VMID, we don't need 165 + * to worry about junk out of guest reset (we nuke the I-cache on 166 + * VMID rollover), but we do need to be careful when remapping 167 + * executable pages for the same guest. This can happen when KSM 168 + * takes a CoW fault on an executable page, copies the page into 169 + * a page that was previously mapped in the guest and then needs 170 + * to invalidate the guest view of the I-cache for that page 171 + * from EL1. To solve this, we invalidate the entire I-cache when 172 + * unmapping a page from a guest if we have a VPIPT I-cache but 173 + * the host is running at EL1. As above, we could do better if 174 + * we had the VA. 175 + * 176 + * The moral of this story is: if you have a VPIPT I-cache, then 177 + * you should be running with VHE enabled. 
178 + */ 179 + if (icache_is_vpipt()) 180 + icache_inval_all_pou(); 181 + 182 + __tlb_switch_to_host(&cxt); 183 + } 184 + 133 185 void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) 134 186 { 135 187 struct tlb_inv_context cxt;

+207 -21

arch/arm64/kvm/hyp/pgtable.c

··· 21 21 22 22 #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) 23 23 #define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) 24 - #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 25 - #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 24 + #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ 25 + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) 26 + #define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ 27 + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) 26 28 #define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) 27 29 #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 28 30 #define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) ··· 36 34 #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 37 35 #define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) 38 36 39 - #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) 37 + #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) 40 38 41 39 #define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) 42 40 43 41 #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) 44 42 45 43 #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) 44 + 45 + #define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) 46 46 47 47 #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ 48 48 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ ··· 66 62 u64 addr; 67 63 const u64 end; 68 64 }; 65 + 66 + static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) 67 + { 68 + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); 69 + } 70 + 71 + static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) 72 + { 73 + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); 74 + } 69 75 70 76 static bool kvm_phys_is_valid(u64 phys) 71 77 { ··· 400 386 401 387 if (device) 402 388 return -EINVAL; 389 + 390 + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) 391 + attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; 403 392 } else { 404 393 attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; 405 394 } ··· 640 623 #ifdef CONFIG_ARM64_HW_AFDBM 641 624 /* 642 625 * Enable the Hardware Access Flag management, unconditionally 643 - * on all CPUs. 
The features is RES0 on CPUs without the support 644 - * and must be ignored by the CPUs. 626 + * on all CPUs. In systems that have asymmetric support for the feature 627 + * this allows KVM to leverage hardware support on the subset of cores 628 + * that implement the feature. 629 + * 630 + * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by 631 + * hardware) on implementations that do not advertise support for the 632 + * feature. As such, setting HA unconditionally is safe, unless you 633 + * happen to be running on a design that has unadvertised support for 634 + * HAFDBS. Here be dragons. 645 635 */ 646 - vtcr |= VTCR_EL2_HA; 636 + if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) 637 + vtcr |= VTCR_EL2_HA; 647 638 #endif /* CONFIG_ARM64_HW_AFDBM */ 648 639 649 640 /* Set the vmid bits */ ··· 780 755 if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) 781 756 return false; 782 757 783 - /* 784 - * Perform the appropriate TLB invalidation based on the evicted pte 785 - * value (if any). 786 - */ 787 - if (kvm_pte_table(ctx->old, ctx->level)) 788 - kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); 789 - else if (kvm_pte_valid(ctx->old)) 790 - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); 758 + if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { 759 + /* 760 + * Perform the appropriate TLB invalidation based on the 761 + * evicted pte value (if any). 
762 + */ 763 + if (kvm_pte_table(ctx->old, ctx->level)) 764 + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); 765 + else if (kvm_pte_valid(ctx->old)) 766 + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, 767 + ctx->addr, ctx->level); 768 + } 791 769 792 770 if (stage2_pte_is_counted(ctx->old)) 793 771 mm_ops->put_page(ctx->ptep); ··· 897 869 return -EAGAIN; 898 870 899 871 /* Perform CMOs before installation of the guest stage-2 PTE */ 900 - if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) 872 + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && 873 + stage2_pte_cacheable(pgt, new)) 901 874 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), 902 - granule); 875 + granule); 903 876 904 - if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) 877 + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && 878 + stage2_pte_executable(new)) 905 879 mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); 906 880 907 881 stage2_make_pte(ctx, new); ··· 925 895 if (ret) 926 896 return ret; 927 897 928 - mm_ops->free_removed_table(childp, ctx->level); 898 + mm_ops->free_unlinked_table(childp, ctx->level); 929 899 return 0; 930 900 } 931 901 ··· 970 940 * The TABLE_PRE callback runs for table entries on the way down, looking 971 941 * for table entries which we could conceivably replace with a block entry 972 942 * for this mapping. If it finds one it replaces the entry and calls 973 - * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table. 943 + * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. 974 944 * 975 945 * Otherwise, the LEAF callback performs the mapping at the existing leaves 976 946 * instead. 
··· 1239 1209 KVM_PGTABLE_WALK_HANDLE_FAULT | 1240 1210 KVM_PGTABLE_WALK_SHARED); 1241 1211 if (!ret) 1242 - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); 1212 + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); 1243 1213 return ret; 1244 1214 } 1245 1215 ··· 1272 1242 return kvm_pgtable_walk(pgt, addr, size, &walker); 1273 1243 } 1274 1244 1245 + kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, 1246 + u64 phys, u32 level, 1247 + enum kvm_pgtable_prot prot, 1248 + void *mc, bool force_pte) 1249 + { 1250 + struct stage2_map_data map_data = { 1251 + .phys = phys, 1252 + .mmu = pgt->mmu, 1253 + .memcache = mc, 1254 + .force_pte = force_pte, 1255 + }; 1256 + struct kvm_pgtable_walker walker = { 1257 + .cb = stage2_map_walker, 1258 + .flags = KVM_PGTABLE_WALK_LEAF | 1259 + KVM_PGTABLE_WALK_SKIP_BBM_TLBI | 1260 + KVM_PGTABLE_WALK_SKIP_CMO, 1261 + .arg = &map_data, 1262 + }; 1263 + /* 1264 + * The input address (.addr) is irrelevant for walking an 1265 + * unlinked table. Construct an ambiguous IA range to map 1266 + * kvm_granule_size(level) worth of memory. 
1267 + */ 1268 + struct kvm_pgtable_walk_data data = { 1269 + .walker = &walker, 1270 + .addr = 0, 1271 + .end = kvm_granule_size(level), 1272 + }; 1273 + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; 1274 + kvm_pte_t *pgtable; 1275 + int ret; 1276 + 1277 + if (!IS_ALIGNED(phys, kvm_granule_size(level))) 1278 + return ERR_PTR(-EINVAL); 1279 + 1280 + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); 1281 + if (ret) 1282 + return ERR_PTR(ret); 1283 + 1284 + pgtable = mm_ops->zalloc_page(mc); 1285 + if (!pgtable) 1286 + return ERR_PTR(-ENOMEM); 1287 + 1288 + ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, 1289 + level + 1); 1290 + if (ret) { 1291 + kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); 1292 + mm_ops->put_page(pgtable); 1293 + return ERR_PTR(ret); 1294 + } 1295 + 1296 + return pgtable; 1297 + } 1298 + 1299 + /* 1300 + * Get the number of page-tables needed to replace a block with a 1301 + * fully populated tree up to the PTE entries. Note that @level is 1302 + * interpreted as in "level @level entry". 
1303 + */ 1304 + static int stage2_block_get_nr_page_tables(u32 level) 1305 + { 1306 + switch (level) { 1307 + case 1: 1308 + return PTRS_PER_PTE + 1; 1309 + case 2: 1310 + return 1; 1311 + case 3: 1312 + return 0; 1313 + default: 1314 + WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || 1315 + level >= KVM_PGTABLE_MAX_LEVELS); 1316 + return -EINVAL; 1317 + }; 1318 + } 1319 + 1320 + static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, 1321 + enum kvm_pgtable_walk_flags visit) 1322 + { 1323 + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; 1324 + struct kvm_mmu_memory_cache *mc = ctx->arg; 1325 + struct kvm_s2_mmu *mmu; 1326 + kvm_pte_t pte = ctx->old, new, *childp; 1327 + enum kvm_pgtable_prot prot; 1328 + u32 level = ctx->level; 1329 + bool force_pte; 1330 + int nr_pages; 1331 + u64 phys; 1332 + 1333 + /* No huge-pages exist at the last level */ 1334 + if (level == KVM_PGTABLE_MAX_LEVELS - 1) 1335 + return 0; 1336 + 1337 + /* We only split valid block mappings */ 1338 + if (!kvm_pte_valid(pte)) 1339 + return 0; 1340 + 1341 + nr_pages = stage2_block_get_nr_page_tables(level); 1342 + if (nr_pages < 0) 1343 + return nr_pages; 1344 + 1345 + if (mc->nobjs >= nr_pages) { 1346 + /* Build a tree mapped down to the PTE granularity. */ 1347 + force_pte = true; 1348 + } else { 1349 + /* 1350 + * Don't force PTEs, so create_unlinked() below does 1351 + * not populate the tree up to the PTE level. The 1352 + * consequence is that the call will require a single 1353 + * page of level 2 entries at level 1, or a single 1354 + * page of PTEs at level 2. If we are at level 1, the 1355 + * PTEs will be created recursively. 
1356 + */ 1357 + force_pte = false; 1358 + nr_pages = 1; 1359 + } 1360 + 1361 + if (mc->nobjs < nr_pages) 1362 + return -ENOMEM; 1363 + 1364 + mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); 1365 + phys = kvm_pte_to_phys(pte); 1366 + prot = kvm_pgtable_stage2_pte_prot(pte); 1367 + 1368 + childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, 1369 + level, prot, mc, force_pte); 1370 + if (IS_ERR(childp)) 1371 + return PTR_ERR(childp); 1372 + 1373 + if (!stage2_try_break_pte(ctx, mmu)) { 1374 + kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); 1375 + mm_ops->put_page(childp); 1376 + return -EAGAIN; 1377 + } 1378 + 1379 + /* 1380 + * Note, the contents of the page table are guaranteed to be made 1381 + * visible before the new PTE is assigned because stage2_make_pte() 1382 + * writes the PTE using smp_store_release(). 1383 + */ 1384 + new = kvm_init_table_pte(childp, mm_ops); 1385 + stage2_make_pte(ctx, new); 1386 + dsb(ishst); 1387 + return 0; 1388 + } 1389 + 1390 + int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, 1391 + struct kvm_mmu_memory_cache *mc) 1392 + { 1393 + struct kvm_pgtable_walker walker = { 1394 + .cb = stage2_split_walker, 1395 + .flags = KVM_PGTABLE_WALK_LEAF, 1396 + .arg = mc, 1397 + }; 1398 + 1399 + return kvm_pgtable_walk(pgt, addr, size, &walker); 1400 + } 1275 1401 1276 1402 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, 1277 1403 struct kvm_pgtable_mm_ops *mm_ops, ··· 1497 1311 pgt->pgd = NULL; 1498 1312 } 1499 1313 1500 - void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) 1314 + void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) 1501 1315 { 1502 1316 kvm_pteref_t ptep = (kvm_pteref_t)pgtable; 1503 1317 struct kvm_pgtable_walker walker = {

+1 -1

arch/arm64/kvm/hyp/vhe/switch.c

··· 84 84 */ 85 85 asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); 86 86 87 - write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); 87 + kvm_reset_cptr_el2(vcpu); 88 88 89 89 if (!arm64_kernel_unmapped_at_el0()) 90 90 host_vectors = __this_cpu_read(this_cpu_vector);

+32

arch/arm64/kvm/hyp/vhe/tlb.c

··· 111 111 __tlb_switch_to_host(&cxt); 112 112 } 113 113 114 + void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, 115 + phys_addr_t ipa, int level) 116 + { 117 + struct tlb_inv_context cxt; 118 + 119 + dsb(nshst); 120 + 121 + /* Switch to requested VMID */ 122 + __tlb_switch_to_guest(mmu, &cxt); 123 + 124 + /* 125 + * We could do so much better if we had the VA as well. 126 + * Instead, we invalidate Stage-2 for this IPA, and the 127 + * whole of Stage-1. Weep... 128 + */ 129 + ipa >>= 12; 130 + __tlbi_level(ipas2e1, ipa, level); 131 + 132 + /* 133 + * We have to ensure completion of the invalidation at Stage-2, 134 + * since a table walk on another CPU could refill a TLB with a 135 + * complete (S1 + S2) walk based on the old Stage-2 mapping if 136 + * the Stage-1 invalidation happened first. 137 + */ 138 + dsb(nsh); 139 + __tlbi(vmalle1); 140 + dsb(nsh); 141 + isb(); 142 + 143 + __tlb_switch_to_host(&cxt); 144 + } 145 + 114 146 void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) 115 147 { 116 148 struct tlb_inv_context cxt;

+174 -35

arch/arm64/kvm/mmu.c

··· 31 31 32 32 static unsigned long __ro_after_init io_map_base; 33 33 34 - static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 34 + static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, 35 + phys_addr_t size) 35 36 { 36 - phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 37 37 phys_addr_t boundary = ALIGN_DOWN(addr + size, size); 38 38 39 39 return (boundary - 1 < end - 1) ? boundary : end; 40 + } 41 + 42 + static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) 43 + { 44 + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); 45 + 46 + return __stage2_range_addr_end(addr, end, size); 40 47 } 41 48 42 49 /* ··· 81 74 82 75 #define stage2_apply_range_resched(mmu, addr, end, fn) \ 83 76 stage2_apply_range(mmu, addr, end, fn, true) 77 + 78 + /* 79 + * Get the maximum number of page-tables pages needed to split a range 80 + * of blocks into PAGE_SIZE PTEs. It assumes the range is already 81 + * mapped at level 2, or at level 1 if allowed. 
82 + */ 83 + static int kvm_mmu_split_nr_page_tables(u64 range) 84 + { 85 + int n = 0; 86 + 87 + if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) 88 + n += DIV_ROUND_UP(range, PUD_SIZE); 89 + n += DIV_ROUND_UP(range, PMD_SIZE); 90 + return n; 91 + } 92 + 93 + static bool need_split_memcache_topup_or_resched(struct kvm *kvm) 94 + { 95 + struct kvm_mmu_memory_cache *cache; 96 + u64 chunk_size, min; 97 + 98 + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) 99 + return true; 100 + 101 + chunk_size = kvm->arch.mmu.split_page_chunk_size; 102 + min = kvm_mmu_split_nr_page_tables(chunk_size); 103 + cache = &kvm->arch.mmu.split_page_cache; 104 + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; 105 + } 106 + 107 + static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, 108 + phys_addr_t end) 109 + { 110 + struct kvm_mmu_memory_cache *cache; 111 + struct kvm_pgtable *pgt; 112 + int ret, cache_capacity; 113 + u64 next, chunk_size; 114 + 115 + lockdep_assert_held_write(&kvm->mmu_lock); 116 + 117 + chunk_size = kvm->arch.mmu.split_page_chunk_size; 118 + cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); 119 + 120 + if (chunk_size == 0) 121 + return 0; 122 + 123 + cache = &kvm->arch.mmu.split_page_cache; 124 + 125 + do { 126 + if (need_split_memcache_topup_or_resched(kvm)) { 127 + write_unlock(&kvm->mmu_lock); 128 + cond_resched(); 129 + /* Eager page splitting is best-effort. 
*/ 130 + ret = __kvm_mmu_topup_memory_cache(cache, 131 + cache_capacity, 132 + cache_capacity); 133 + write_lock(&kvm->mmu_lock); 134 + if (ret) 135 + break; 136 + } 137 + 138 + pgt = kvm->arch.mmu.pgt; 139 + if (!pgt) 140 + return -EINVAL; 141 + 142 + next = __stage2_range_addr_end(addr, end, chunk_size); 143 + ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache); 144 + if (ret) 145 + break; 146 + } while (addr = next, addr != end); 147 + 148 + return ret; 149 + } 84 150 85 151 static bool memslot_is_logging(struct kvm_memory_slot *memslot) 86 152 { ··· 211 131 212 132 static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; 213 133 214 - static void stage2_free_removed_table_rcu_cb(struct rcu_head *head) 134 + static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) 215 135 { 216 136 struct page *page = container_of(head, struct page, rcu_head); 217 137 void *pgtable = page_to_virt(page); 218 138 u32 level = page_private(page); 219 139 220 - kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level); 140 + kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); 221 141 } 222 142 223 - static void stage2_free_removed_table(void *addr, u32 level) 143 + static void stage2_free_unlinked_table(void *addr, u32 level) 224 144 { 225 145 struct page *page = virt_to_page(addr); 226 146 227 147 set_page_private(page, (unsigned long)level); 228 - call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb); 148 + call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); 229 149 } 230 150 231 151 static void kvm_host_get_page(void *addr) ··· 781 701 .zalloc_page = stage2_memcache_zalloc_page, 782 702 .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, 783 703 .free_pages_exact = kvm_s2_free_pages_exact, 784 - .free_removed_table = stage2_free_removed_table, 704 + .free_unlinked_table = stage2_free_unlinked_table, 785 705 .get_page = kvm_host_get_page, 786 706 .put_page = kvm_s2_put_page, 787 707 .page_count = kvm_host_page_count, ··· 855 775 
for_each_possible_cpu(cpu) 856 776 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 857 777 778 + /* The eager page splitting is disabled by default */ 779 + mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; 780 + mmu->split_page_cache.gfp_zero = __GFP_ZERO; 781 + 858 782 mmu->pgt = pgt; 859 783 mmu->pgd_phys = __pa(pgt->pgd); 860 784 return 0; ··· 868 784 out_free_pgtable: 869 785 kfree(pgt); 870 786 return err; 787 + } 788 + 789 + void kvm_uninit_stage2_mmu(struct kvm *kvm) 790 + { 791 + kvm_free_stage2_pgd(&kvm->arch.mmu); 792 + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 871 793 } 872 794 873 795 static void stage2_unmap_memslot(struct kvm *kvm, ··· 1079 989 } 1080 990 1081 991 /** 1082 - * kvm_mmu_write_protect_pt_masked() - write protect dirty pages 992 + * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE 993 + * pages for memory slot 994 + * @kvm: The KVM pointer 995 + * @slot: The memory slot to split 996 + * 997 + * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, 998 + * serializing operations for VM memory regions. 999 + */ 1000 + static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) 1001 + { 1002 + struct kvm_memslots *slots; 1003 + struct kvm_memory_slot *memslot; 1004 + phys_addr_t start, end; 1005 + 1006 + lockdep_assert_held(&kvm->slots_lock); 1007 + 1008 + slots = kvm_memslots(kvm); 1009 + memslot = id_to_memslot(slots, slot); 1010 + 1011 + start = memslot->base_gfn << PAGE_SHIFT; 1012 + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; 1013 + 1014 + write_lock(&kvm->mmu_lock); 1015 + kvm_mmu_split_huge_pages(kvm, start, end); 1016 + write_unlock(&kvm->mmu_lock); 1017 + } 1018 + 1019 + /* 1020 + * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. 
1083 1021 * @kvm: The KVM pointer 1084 1022 * @slot: The memory slot associated with mask 1085 1023 * @gfn_offset: The gfn offset in memory slot 1086 - * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory 1087 - * slot to be write protected 1024 + * @mask: The mask of pages at offset 'gfn_offset' in this memory 1025 + * slot to enable dirty logging on 1088 1026 * 1089 - * Walks bits set in mask write protects the associated pte's. Caller must 1090 - * acquire kvm_mmu_lock. 1027 + * Writes protect selected pages to enable dirty logging, and then 1028 + * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. 1091 1029 */ 1092 - static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1030 + void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1093 1031 struct kvm_memory_slot *slot, 1094 1032 gfn_t gfn_offset, unsigned long mask) 1095 1033 { ··· 1125 1007 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; 1126 1008 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; 1127 1009 1128 - stage2_wp_range(&kvm->arch.mmu, start, end); 1129 - } 1010 + lockdep_assert_held_write(&kvm->mmu_lock); 1130 1011 1131 - /* 1132 - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1133 - * dirty pages. 1134 - * 1135 - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1136 - * enable dirty logging for them. 1137 - */ 1138 - void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1139 - struct kvm_memory_slot *slot, 1140 - gfn_t gfn_offset, unsigned long mask) 1141 - { 1142 - kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1012 + stage2_wp_range(&kvm->arch.mmu, start, end); 1013 + 1014 + /* 1015 + * Eager-splitting is done when manual-protect is set. We 1016 + * also check for initially-all-set because we can avoid 1017 + * eager-splitting if initially-all-set is false. 
1018 + * Initially-all-set equal false implies that huge-pages were 1019 + * already split when enabling dirty logging: no need to do it 1020 + * again. 1021 + */ 1022 + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1023 + kvm_mmu_split_huge_pages(kvm, start, end); 1143 1024 } 1144 1025 1145 1026 static void kvm_send_hwpoison_signal(unsigned long address, short lsb) ··· 1907 1790 const struct kvm_memory_slot *new, 1908 1791 enum kvm_mr_change change) 1909 1792 { 1793 + bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; 1794 + 1910 1795 /* 1911 1796 * At this point memslot has been committed and there is an 1912 1797 * allocated dirty_bitmap[], dirty pages will be tracked while the 1913 1798 * memory slot is write protected. 1914 1799 */ 1915 - if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) { 1800 + if (log_dirty_pages) { 1801 + 1802 + if (change == KVM_MR_DELETE) 1803 + return; 1804 + 1916 1805 /* 1917 - * If we're with initial-all-set, we don't need to write 1918 - * protect any pages because they're all reported as dirty. 1919 - * Huge pages and normal pages will be write protect gradually. 1806 + * Huge and normal pages are write-protected and split 1807 + * on either of these two cases: 1808 + * 1809 + * 1. with initial-all-set: gradually with CLEAR ioctls, 1920 1810 */ 1921 - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { 1922 - kvm_mmu_wp_memory_region(kvm, new->id); 1923 - } 1811 + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1812 + return; 1813 + /* 1814 + * or 1815 + * 2. without initial-all-set: all in one shot when 1816 + * enabling dirty logging. 1817 + */ 1818 + kvm_mmu_wp_memory_region(kvm, new->id); 1819 + kvm_mmu_split_memory_region(kvm, new->id); 1820 + } else { 1821 + /* 1822 + * Free any leftovers from the eager page splitting cache. Do 1823 + * this when deleting, moving, disabling dirty logging, or 1824 + * creating the memslot (a nop). 
Doing it for deletes makes 1825 + * sure we don't leak memory, and there's no need to keep the 1826 + * cache around for any of the other cases. 1827 + */ 1828 + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); 1924 1829 } 1925 1830 } 1926 1831 ··· 2016 1877 2017 1878 void kvm_arch_flush_shadow_all(struct kvm *kvm) 2018 1879 { 2019 - kvm_free_stage2_pgd(&kvm->arch.mmu); 1880 + kvm_uninit_stage2_mmu(kvm); 2020 1881 } 2021 1882 2022 1883 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,

+1

arch/arm64/kvm/pkvm.c

··· 78 78 hyp_mem_pages += host_s2_pgtable_pages(); 79 79 hyp_mem_pages += hyp_vm_table_pages(); 80 80 hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); 81 + hyp_mem_pages += hyp_ffa_proxy_pages(); 81 82 82 83 /* 83 84 * Try to allocate a PMD-aligned region to reduce TLB pressure once

-58

arch/arm64/kvm/reset.c

··· 187 187 } 188 188 189 189 /** 190 - * kvm_set_vm_width() - set the register width for the guest 191 - * @vcpu: Pointer to the vcpu being configured 192 - * 193 - * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 194 - * in the VM flags based on the vcpu's requested register width, the HW 195 - * capabilities and other options (such as MTE). 196 - * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be 197 - * consistent with the value of the FLAG_EL1_32BIT bit in the flags. 198 - * 199 - * Return: 0 on success, negative error code on failure. 200 - */ 201 - static int kvm_set_vm_width(struct kvm_vcpu *vcpu) 202 - { 203 - struct kvm *kvm = vcpu->kvm; 204 - bool is32bit; 205 - 206 - is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); 207 - 208 - lockdep_assert_held(&kvm->arch.config_lock); 209 - 210 - if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) { 211 - /* 212 - * The guest's register width is already configured. 213 - * Make sure that the vcpu is consistent with it. 
214 - */ 215 - if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags)) 216 - return 0; 217 - 218 - return -EINVAL; 219 - } 220 - 221 - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) 222 - return -EINVAL; 223 - 224 - /* MTE is incompatible with AArch32 */ 225 - if (kvm_has_mte(kvm) && is32bit) 226 - return -EINVAL; 227 - 228 - /* NV is incompatible with AArch32 */ 229 - if (vcpu_has_nv(vcpu) && is32bit) 230 - return -EINVAL; 231 - 232 - if (is32bit) 233 - set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); 234 - 235 - set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags); 236 - 237 - return 0; 238 - } 239 - 240 - /** 241 190 * kvm_reset_vcpu - sets core registers and sys_regs to reset value 242 191 * @vcpu: The VCPU pointer 243 192 * ··· 210 261 int ret; 211 262 bool loaded; 212 263 u32 pstate; 213 - 214 - mutex_lock(&vcpu->kvm->arch.config_lock); 215 - ret = kvm_set_vm_width(vcpu); 216 - mutex_unlock(&vcpu->kvm->arch.config_lock); 217 - 218 - if (ret) 219 - return ret; 220 264 221 265 spin_lock(&vcpu->arch.mp_state_lock); 222 266 reset_state = vcpu->arch.reset_state;

+353 -152

arch/arm64/kvm/sys_regs.c

··· 42 42 */ 43 43 44 44 static u64 sys_reg_to_index(const struct sys_reg_desc *reg); 45 + static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 46 + u64 val); 45 47 46 48 static bool read_from_write_only(struct kvm_vcpu *vcpu, 47 49 struct sys_reg_params *params, ··· 555 553 return 0; 556 554 } 557 555 558 - static void reset_bvr(struct kvm_vcpu *vcpu, 556 + static u64 reset_bvr(struct kvm_vcpu *vcpu, 559 557 const struct sys_reg_desc *rd) 560 558 { 561 559 vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; 560 + return rd->val; 562 561 } 563 562 564 563 static bool trap_bcr(struct kvm_vcpu *vcpu, ··· 592 589 return 0; 593 590 } 594 591 595 - static void reset_bcr(struct kvm_vcpu *vcpu, 592 + static u64 reset_bcr(struct kvm_vcpu *vcpu, 596 593 const struct sys_reg_desc *rd) 597 594 { 598 595 vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; 596 + return rd->val; 599 597 } 600 598 601 599 static bool trap_wvr(struct kvm_vcpu *vcpu, ··· 630 626 return 0; 631 627 } 632 628 633 - static void reset_wvr(struct kvm_vcpu *vcpu, 629 + static u64 reset_wvr(struct kvm_vcpu *vcpu, 634 630 const struct sys_reg_desc *rd) 635 631 { 636 632 vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; 633 + return rd->val; 637 634 } 638 635 639 636 static bool trap_wcr(struct kvm_vcpu *vcpu, ··· 667 662 return 0; 668 663 } 669 664 670 - static void reset_wcr(struct kvm_vcpu *vcpu, 665 + static u64 reset_wcr(struct kvm_vcpu *vcpu, 671 666 const struct sys_reg_desc *rd) 672 667 { 673 668 vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; 669 + return rd->val; 674 670 } 675 671 676 - static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 672 + static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 677 673 { 678 674 u64 amair = read_sysreg(amair_el1); 679 675 vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); 676 + return amair; 680 677 } 681 678 682 - static void reset_actlr(struct kvm_vcpu *vcpu, const 
struct sys_reg_desc *r) 679 + static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 683 680 { 684 681 u64 actlr = read_sysreg(actlr_el1); 685 682 vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); 683 + return actlr; 686 684 } 687 685 688 - static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 686 + static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 689 687 { 690 688 u64 mpidr; 691 689 ··· 702 694 mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); 703 695 mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); 704 696 mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); 705 - vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1); 697 + mpidr |= (1ULL << 31); 698 + vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); 699 + 700 + return mpidr; 706 701 } 707 702 708 703 static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, ··· 717 706 return REG_HIDDEN; 718 707 } 719 708 720 - static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 709 + static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 721 710 { 722 711 u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); 723 712 724 713 /* No PMU available, any PMU reg may UNDEF... 
*/ 725 714 if (!kvm_arm_support_pmu_v3()) 726 - return; 715 + return 0; 727 716 728 717 n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; 729 718 n &= ARMV8_PMU_PMCR_N_MASK; ··· 732 721 733 722 reset_unknown(vcpu, r); 734 723 __vcpu_sys_reg(vcpu, r->reg) &= mask; 724 + 725 + return __vcpu_sys_reg(vcpu, r->reg); 735 726 } 736 727 737 - static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 728 + static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 738 729 { 739 730 reset_unknown(vcpu, r); 740 731 __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); 732 + 733 + return __vcpu_sys_reg(vcpu, r->reg); 741 734 } 742 735 743 - static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 736 + static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 744 737 { 745 738 reset_unknown(vcpu, r); 746 739 __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; 740 + 741 + return __vcpu_sys_reg(vcpu, r->reg); 747 742 } 748 743 749 - static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 744 + static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 750 745 { 751 746 reset_unknown(vcpu, r); 752 747 __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; 748 + 749 + return __vcpu_sys_reg(vcpu, r->reg); 753 750 } 754 751 755 - static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 752 + static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 756 753 { 757 754 u64 pmcr; 758 755 759 756 /* No PMU available, PMCR_EL0 may UNDEF... 
*/ 760 757 if (!kvm_arm_support_pmu_v3()) 761 - return; 758 + return 0; 762 759 763 760 /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ 764 761 pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); ··· 774 755 pmcr |= ARMV8_PMU_PMCR_LC; 775 756 776 757 __vcpu_sys_reg(vcpu, r->reg) = pmcr; 758 + 759 + return __vcpu_sys_reg(vcpu, r->reg); 777 760 } 778 761 779 762 static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) ··· 1208 1187 return true; 1209 1188 } 1210 1189 1211 - static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) 1190 + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, 1191 + s64 new, s64 cur) 1212 1192 { 1213 - if (kvm_vcpu_has_pmu(vcpu)) 1214 - return vcpu->kvm->arch.dfr0_pmuver.imp; 1193 + struct arm64_ftr_bits kvm_ftr = *ftrp; 1215 1194 1216 - return vcpu->kvm->arch.dfr0_pmuver.unimp; 1195 + /* Some features have different safe value type in KVM than host features */ 1196 + switch (id) { 1197 + case SYS_ID_AA64DFR0_EL1: 1198 + if (kvm_ftr.shift == ID_AA64DFR0_EL1_PMUVer_SHIFT) 1199 + kvm_ftr.type = FTR_LOWER_SAFE; 1200 + break; 1201 + case SYS_ID_DFR0_EL1: 1202 + if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT) 1203 + kvm_ftr.type = FTR_LOWER_SAFE; 1204 + break; 1205 + } 1206 + 1207 + return arm64_ftr_safe_value(&kvm_ftr, new, cur); 1217 1208 } 1218 1209 1219 - static u8 perfmon_to_pmuver(u8 perfmon) 1210 + /** 1211 + * arm64_check_features() - Check if a feature register value constitutes 1212 + * a subset of features indicated by the idreg's KVM sanitised limit. 1213 + * 1214 + * This function will check if each feature field of @val is the "safe" value 1215 + * against idreg's KVM sanitised limit returned from the reset() callback. 1216 + * If a field value in @val is the same as the one in limit, it is always 1217 + * considered the safe value regardless. For register fields that are not 1218 + * writable, only the value in limit is considered the safe value. 
1219 + * 1220 + * Return: 0 if all the fields are safe. Otherwise, return negative errno. 1221 + */ 1222 + static int arm64_check_features(struct kvm_vcpu *vcpu, 1223 + const struct sys_reg_desc *rd, 1224 + u64 val) 1220 1225 { 1221 - switch (perfmon) { 1222 - case ID_DFR0_EL1_PerfMon_PMUv3: 1223 - return ID_AA64DFR0_EL1_PMUVer_IMP; 1224 - case ID_DFR0_EL1_PerfMon_IMPDEF: 1225 - return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; 1226 - default: 1227 - /* Anything ARMv8.1+ and NI have the same value. For now. */ 1228 - return perfmon; 1226 + const struct arm64_ftr_reg *ftr_reg; 1227 + const struct arm64_ftr_bits *ftrp = NULL; 1228 + u32 id = reg_to_encoding(rd); 1229 + u64 writable_mask = rd->val; 1230 + u64 limit = rd->reset(vcpu, rd); 1231 + u64 mask = 0; 1232 + 1233 + /* 1234 + * Hidden and unallocated ID registers may not have a corresponding 1235 + * struct arm64_ftr_reg. Of course, if the register is RAZ we know the 1236 + * only safe value is 0. 1237 + */ 1238 + if (sysreg_visible_as_raz(vcpu, rd)) 1239 + return val ? -E2BIG : 0; 1240 + 1241 + ftr_reg = get_arm64_ftr_reg(id); 1242 + if (!ftr_reg) 1243 + return -EINVAL; 1244 + 1245 + ftrp = ftr_reg->ftr_bits; 1246 + 1247 + for (; ftrp && ftrp->width; ftrp++) { 1248 + s64 f_val, f_lim, safe_val; 1249 + u64 ftr_mask; 1250 + 1251 + ftr_mask = arm64_ftr_mask(ftrp); 1252 + if ((ftr_mask & writable_mask) != ftr_mask) 1253 + continue; 1254 + 1255 + f_val = arm64_ftr_value(ftrp, val); 1256 + f_lim = arm64_ftr_value(ftrp, limit); 1257 + mask |= ftr_mask; 1258 + 1259 + if (f_val == f_lim) 1260 + safe_val = f_val; 1261 + else 1262 + safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); 1263 + 1264 + if (safe_val != f_val) 1265 + return -E2BIG; 1229 1266 } 1267 + 1268 + /* For fields that are not writable, values in limit are the safe values. 
*/ 1269 + if ((val & ~mask) != (limit & ~mask)) 1270 + return -E2BIG; 1271 + 1272 + return 0; 1230 1273 } 1231 1274 1232 1275 static u8 pmuver_to_perfmon(u8 pmuver) ··· 1307 1222 } 1308 1223 1309 1224 /* Read a sanitised cpufeature ID register by sys_reg_desc */ 1310 - static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) 1225 + static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, 1226 + const struct sys_reg_desc *r) 1311 1227 { 1312 1228 u32 id = reg_to_encoding(r); 1313 1229 u64 val; ··· 1319 1233 val = read_sanitised_ftr_reg(id); 1320 1234 1321 1235 switch (id) { 1322 - case SYS_ID_AA64PFR0_EL1: 1323 - if (!vcpu_has_sve(vcpu)) 1324 - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); 1325 - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); 1326 - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); 1327 - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); 1328 - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); 1329 - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); 1330 - if (kvm_vgic_global_state.type == VGIC_V3) { 1331 - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); 1332 - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); 1333 - } 1334 - break; 1335 1236 case SYS_ID_AA64PFR1_EL1: 1336 1237 if (!kvm_has_mte(vcpu->kvm)) 1337 1238 val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); ··· 1339 1266 if (!cpus_have_final_cap(ARM64_HAS_WFXT)) 1340 1267 val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); 1341 1268 break; 1342 - case SYS_ID_AA64DFR0_EL1: 1343 - /* Limit debug to ARMv8.0 */ 1344 - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); 1345 - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); 1346 - /* Set PMUver to the required version */ 1347 - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); 1348 - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), 1349 - vcpu_pmuver(vcpu)); 1350 - /* Hide 
SPE from guests */ 1351 - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); 1352 - break; 1353 - case SYS_ID_DFR0_EL1: 1354 - val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); 1355 - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), 1356 - pmuver_to_perfmon(vcpu_pmuver(vcpu))); 1357 - break; 1358 1269 case SYS_ID_AA64MMFR2_EL1: 1359 1270 val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; 1360 1271 break; ··· 1348 1291 } 1349 1292 1350 1293 return val; 1294 + } 1295 + 1296 + static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, 1297 + const struct sys_reg_desc *r) 1298 + { 1299 + return __kvm_read_sanitised_id_reg(vcpu, r); 1300 + } 1301 + 1302 + static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 1303 + { 1304 + return IDREG(vcpu->kvm, reg_to_encoding(r)); 1305 + } 1306 + 1307 + /* 1308 + * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is 1309 + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. 1310 + */ 1311 + static inline bool is_id_reg(u32 id) 1312 + { 1313 + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && 1314 + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && 1315 + sys_reg_CRm(id) < 8); 1351 1316 } 1352 1317 1353 1318 static unsigned int id_visibility(const struct kvm_vcpu *vcpu, ··· 1433 1354 return REG_HIDDEN; 1434 1355 } 1435 1356 1436 - static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, 1437 - const struct sys_reg_desc *rd, 1438 - u64 val) 1357 + static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, 1358 + const struct sys_reg_desc *rd) 1439 1359 { 1440 - u8 csv2, csv3; 1360 + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); 1361 + 1362 + if (!vcpu_has_sve(vcpu)) 1363 + val &= ~ID_AA64PFR0_EL1_SVE_MASK; 1441 1364 1442 1365 /* 1443 - * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as 1444 - * it doesn't promise more than what is actually provided (the 1445 - * guest could otherwise be covered in ectoplasmic residue). 
1366 + * The default is to expose CSV2 == 1 if the HW isn't affected. 1367 + * Although this is a per-CPU feature, we make it global because 1368 + * asymmetric systems are just a nuisance. 1369 + * 1370 + * Userspace can override this as long as it doesn't promise 1371 + * the impossible. 1446 1372 */ 1447 - csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT); 1448 - if (csv2 > 1 || 1449 - (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED)) 1450 - return -EINVAL; 1373 + if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { 1374 + val &= ~ID_AA64PFR0_EL1_CSV2_MASK; 1375 + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP); 1376 + } 1377 + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { 1378 + val &= ~ID_AA64PFR0_EL1_CSV3_MASK; 1379 + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP); 1380 + } 1451 1381 1452 - /* Same thing for CSV3 */ 1453 - csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT); 1454 - if (csv3 > 1 || 1455 - (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED)) 1456 - return -EINVAL; 1382 + if (kvm_vgic_global_state.type == VGIC_V3) { 1383 + val &= ~ID_AA64PFR0_EL1_GIC_MASK; 1384 + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); 1385 + } 1457 1386 1458 - /* We can only differ with CSV[23], and anything else is an error */ 1459 - val ^= read_id_reg(vcpu, rd); 1460 - val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | 1461 - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); 1462 - if (val) 1463 - return -EINVAL; 1387 + val &= ~ID_AA64PFR0_EL1_AMU_MASK; 1464 1388 1465 - vcpu->kvm->arch.pfr0_csv2 = csv2; 1466 - vcpu->kvm->arch.pfr0_csv3 = csv3; 1389 + return val; 1390 + } 1467 1391 1468 - return 0; 1392 + static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, 1393 + const struct sys_reg_desc *rd) 1394 + { 1395 + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); 1396 + 1397 + /* Limit debug to ARMv8.0 */ 1398 + val &= ~ID_AA64DFR0_EL1_DebugVer_MASK; 1399 
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DebugVer, IMP); 1400 + 1401 + /* 1402 + * Only initialize the PMU version if the vCPU was configured with one. 1403 + */ 1404 + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; 1405 + if (kvm_vcpu_has_pmu(vcpu)) 1406 + val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer, 1407 + kvm_arm_pmu_get_pmuver_limit()); 1408 + 1409 + /* Hide SPE from guests */ 1410 + val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; 1411 + 1412 + return val; 1469 1413 } 1470 1414 1471 1415 static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, 1472 1416 const struct sys_reg_desc *rd, 1473 1417 u64 val) 1474 1418 { 1475 - u8 pmuver, host_pmuver; 1476 - bool valid_pmu; 1477 - 1478 - host_pmuver = kvm_arm_pmu_get_pmuver_limit(); 1419 + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); 1479 1420 1480 1421 /* 1481 - * Allow AA64DFR0_EL1.PMUver to be set from userspace as long 1482 - * as it doesn't promise more than what the HW gives us. We 1483 - * allow an IMPDEF PMU though, only if no PMU is supported 1484 - * (KVM backward compatibility handling). 1422 + * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the 1423 + * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously 1424 + * exposed an IMP_DEF PMU to userspace and the guest on systems w/ 1425 + * non-architectural PMUs. Of course, PMUv3 is the only game in town for 1426 + * PMU virtualization, so the IMP_DEF value was rather user-hostile. 1427 + * 1428 + * At minimum, we're on the hook to allow values that were given to 1429 + * userspace by KVM. Cover our tracks here and replace the IMP_DEF value 1430 + * with a more sensible NI. The value of an ID register changing under 1431 + * the nose of the guest is unfortunate, but is certainly no more 1432 + * surprising than an ill-guided PMU driver poking at impdef system 1433 + * registers that end in an UNDEF... 
1485 1434 */ 1486 - pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val); 1487 - if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver)) 1488 - return -EINVAL; 1435 + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) 1436 + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; 1489 1437 1490 - valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF); 1438 + return set_id_reg(vcpu, rd, val); 1439 + } 1491 1440 1492 - /* Make sure view register and PMU support do match */ 1493 - if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) 1494 - return -EINVAL; 1441 + static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, 1442 + const struct sys_reg_desc *rd) 1443 + { 1444 + u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); 1445 + u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); 1495 1446 1496 - /* We can only differ with PMUver, and anything else is an error */ 1497 - val ^= read_id_reg(vcpu, rd); 1498 - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); 1499 - if (val) 1500 - return -EINVAL; 1447 + val &= ~ID_DFR0_EL1_PerfMon_MASK; 1448 + if (kvm_vcpu_has_pmu(vcpu)) 1449 + val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); 1501 1450 1502 - if (valid_pmu) 1503 - vcpu->kvm->arch.dfr0_pmuver.imp = pmuver; 1504 - else 1505 - vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver; 1506 - 1507 - return 0; 1451 + return val; 1508 1452 } 1509 1453 1510 1454 static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, 1511 1455 const struct sys_reg_desc *rd, 1512 1456 u64 val) 1513 1457 { 1514 - u8 perfmon, host_perfmon; 1515 - bool valid_pmu; 1458 + u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); 1516 1459 1517 - host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); 1460 + if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { 1461 + val &= ~ID_DFR0_EL1_PerfMon_MASK; 1462 + perfmon = 0; 1463 + } 1518 1464 1519 1465 /* 1520 1466 * Allow DFR0_EL1.PerfMon to be set from userspace as long as ··· 1547 1443 * AArch64 side (as everything is emulated with that), and 
1548 1444 * that this is a PMUv3. 1549 1445 */ 1550 - perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val); 1551 - if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) || 1552 - (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)) 1446 + if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) 1553 1447 return -EINVAL; 1554 1448 1555 - valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF); 1556 - 1557 - /* Make sure view register and PMU support do match */ 1558 - if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) 1559 - return -EINVAL; 1560 - 1561 - /* We can only differ with PerfMon, and anything else is an error */ 1562 - val ^= read_id_reg(vcpu, rd); 1563 - val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); 1564 - if (val) 1565 - return -EINVAL; 1566 - 1567 - if (valid_pmu) 1568 - vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon); 1569 - else 1570 - vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon); 1571 - 1572 - return 0; 1449 + return set_id_reg(vcpu, rd, val); 1573 1450 } 1574 1451 1575 1452 /* ··· 1563 1478 static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 1564 1479 u64 *val) 1565 1480 { 1481 + /* 1482 + * Avoid locking if the VM has already started, as the ID registers are 1483 + * guaranteed to be invariant at that point. 1484 + */ 1485 + if (kvm_vm_has_ran_once(vcpu->kvm)) { 1486 + *val = read_id_reg(vcpu, rd); 1487 + return 0; 1488 + } 1489 + 1490 + mutex_lock(&vcpu->kvm->arch.config_lock); 1566 1491 *val = read_id_reg(vcpu, rd); 1492 + mutex_unlock(&vcpu->kvm->arch.config_lock); 1493 + 1567 1494 return 0; 1568 1495 } 1569 1496 1570 1497 static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, 1571 1498 u64 val) 1572 1499 { 1573 - /* This is what we mean by invariant: you can't change it. 
*/ 1574 - if (val != read_id_reg(vcpu, rd)) 1575 - return -EINVAL; 1500 + u32 id = reg_to_encoding(rd); 1501 + int ret; 1576 1502 1577 - return 0; 1503 + mutex_lock(&vcpu->kvm->arch.config_lock); 1504 + 1505 + /* 1506 + * Once the VM has started the ID registers are immutable. Reject any 1507 + * write that does not match the final register value. 1508 + */ 1509 + if (kvm_vm_has_ran_once(vcpu->kvm)) { 1510 + if (val != read_id_reg(vcpu, rd)) 1511 + ret = -EBUSY; 1512 + else 1513 + ret = 0; 1514 + 1515 + mutex_unlock(&vcpu->kvm->arch.config_lock); 1516 + return ret; 1517 + } 1518 + 1519 + ret = arm64_check_features(vcpu, rd, val); 1520 + if (!ret) 1521 + IDREG(vcpu->kvm, id) = val; 1522 + 1523 + mutex_unlock(&vcpu->kvm->arch.config_lock); 1524 + 1525 + /* 1526 + * arm64_check_features() returns -E2BIG to indicate the register's 1527 + * feature set is a superset of the maximally-allowed register value. 1528 + * While it would be nice to precisely describe this to userspace, the 1529 + * existing UAPI for KVM_SET_ONE_REG has it that invalid register 1530 + * writes return -EINVAL. 1531 + */ 1532 + if (ret == -E2BIG) 1533 + ret = -EINVAL; 1534 + return ret; 1578 1535 } 1579 1536 1580 1537 static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, ··· 1656 1529 * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary 1657 1530 * by the physical CPU which the vcpu currently resides in. 
1658 1531 */ 1659 - static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 1532 + static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 1660 1533 { 1661 1534 u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); 1662 1535 u64 clidr; ··· 1704 1577 clidr |= 2 << CLIDR_TTYPE_SHIFT(loc); 1705 1578 1706 1579 __vcpu_sys_reg(vcpu, r->reg) = clidr; 1580 + 1581 + return __vcpu_sys_reg(vcpu, r->reg); 1707 1582 } 1708 1583 1709 1584 static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, ··· 1805 1676 .visibility = elx2_visibility, \ 1806 1677 } 1807 1678 1679 + /* 1680 + * Since reset() callback and field val are not used for idregs, they will be 1681 + * used for specific purposes for idregs. 1682 + * The reset() would return KVM sanitised register value. The value would be the 1683 + * same as the host kernel sanitised value if there is no KVM sanitisation. 1684 + * The val would be used as a mask indicating writable fields for the idreg. 1685 + * Only bits with 1 are writable from userspace. This mask might not be 1686 + * necessary in the future whenever all ID registers are enabled as writable 1687 + * from userspace. 
1688 + */ 1689 + 1808 1690 /* sys_reg_desc initialiser for known cpufeature ID registers */ 1809 1691 #define ID_SANITISED(name) { \ 1810 1692 SYS_DESC(SYS_##name), \ ··· 1823 1683 .get_user = get_id_reg, \ 1824 1684 .set_user = set_id_reg, \ 1825 1685 .visibility = id_visibility, \ 1686 + .reset = kvm_read_sanitised_id_reg, \ 1687 + .val = 0, \ 1826 1688 } 1827 1689 1828 1690 /* sys_reg_desc initialiser for known cpufeature ID registers */ ··· 1834 1692 .get_user = get_id_reg, \ 1835 1693 .set_user = set_id_reg, \ 1836 1694 .visibility = aa32_id_visibility, \ 1695 + .reset = kvm_read_sanitised_id_reg, \ 1696 + .val = 0, \ 1837 1697 } 1838 1698 1839 1699 /* ··· 1848 1704 .access = access_id_reg, \ 1849 1705 .get_user = get_id_reg, \ 1850 1706 .set_user = set_id_reg, \ 1851 - .visibility = raz_visibility \ 1707 + .visibility = raz_visibility, \ 1708 + .reset = kvm_read_sanitised_id_reg, \ 1709 + .val = 0, \ 1852 1710 } 1853 1711 1854 1712 /* ··· 1864 1718 .get_user = get_id_reg, \ 1865 1719 .set_user = set_id_reg, \ 1866 1720 .visibility = raz_visibility, \ 1721 + .reset = kvm_read_sanitised_id_reg, \ 1722 + .val = 0, \ 1867 1723 } 1868 1724 1869 1725 static bool access_sp_el1(struct kvm_vcpu *vcpu, ··· 1973 1825 /* CRm=1 */ 1974 1826 AA32_ID_SANITISED(ID_PFR0_EL1), 1975 1827 AA32_ID_SANITISED(ID_PFR1_EL1), 1976 - { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, 1977 - .get_user = get_id_reg, .set_user = set_id_dfr0_el1, 1978 - .visibility = aa32_id_visibility, }, 1828 + { SYS_DESC(SYS_ID_DFR0_EL1), 1829 + .access = access_id_reg, 1830 + .get_user = get_id_reg, 1831 + .set_user = set_id_dfr0_el1, 1832 + .visibility = aa32_id_visibility, 1833 + .reset = read_sanitised_id_dfr0_el1, 1834 + .val = ID_DFR0_EL1_PerfMon_MASK, }, 1979 1835 ID_HIDDEN(ID_AFR0_EL1), 1980 1836 AA32_ID_SANITISED(ID_MMFR0_EL1), 1981 1837 AA32_ID_SANITISED(ID_MMFR1_EL1), ··· 2008 1856 2009 1857 /* AArch64 ID registers */ 2010 1858 /* CRm=4 */ 2011 - { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = 
access_id_reg, 2012 - .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, }, 1859 + { SYS_DESC(SYS_ID_AA64PFR0_EL1), 1860 + .access = access_id_reg, 1861 + .get_user = get_id_reg, 1862 + .set_user = set_id_reg, 1863 + .reset = read_sanitised_id_aa64pfr0_el1, 1864 + .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, }, 2013 1865 ID_SANITISED(ID_AA64PFR1_EL1), 2014 1866 ID_UNALLOCATED(4,2), 2015 1867 ID_UNALLOCATED(4,3), ··· 2023 1867 ID_UNALLOCATED(4,7), 2024 1868 2025 1869 /* CRm=5 */ 2026 - { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg, 2027 - .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, }, 1870 + { SYS_DESC(SYS_ID_AA64DFR0_EL1), 1871 + .access = access_id_reg, 1872 + .get_user = get_id_reg, 1873 + .set_user = set_id_aa64dfr0_el1, 1874 + .reset = read_sanitised_id_aa64dfr0_el1, 1875 + .val = ID_AA64DFR0_EL1_PMUVer_MASK, }, 2028 1876 ID_SANITISED(ID_AA64DFR1_EL1), 2029 1877 ID_UNALLOCATED(5,2), 2030 1878 ID_UNALLOCATED(5,3), ··· 2359 2199 EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), 2360 2200 EL2_REG(HCR_EL2, access_rw, reset_val, 0), 2361 2201 EL2_REG(MDCR_EL2, access_rw, reset_val, 0), 2362 - EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ), 2202 + EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), 2363 2203 EL2_REG(HSTR_EL2, access_rw, reset_val, 0), 2364 2204 EL2_REG(HACR_EL2, access_rw, reset_val, 0), 2365 2205 ··· 2415 2255 2416 2256 EL2_REG(SP_EL2, NULL, reset_unknown, 0), 2417 2257 }; 2258 + 2259 + static const struct sys_reg_desc *first_idreg; 2418 2260 2419 2261 static bool trap_dbgdidr(struct kvm_vcpu *vcpu, 2420 2262 struct sys_reg_params *p, ··· 3108 2946 return false; 3109 2947 } 3110 2948 2949 + static void kvm_reset_id_regs(struct kvm_vcpu *vcpu) 2950 + { 2951 + const struct sys_reg_desc *idreg = first_idreg; 2952 + u32 id = reg_to_encoding(idreg); 2953 + struct kvm *kvm = vcpu->kvm; 2954 + 2955 + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) 2956 + return; 2957 + 2958 + 
lockdep_assert_held(&kvm->arch.config_lock); 2959 + 2960 + /* Initialize all idregs */ 2961 + while (is_id_reg(id)) { 2962 + IDREG(kvm, id) = idreg->reset(vcpu, idreg); 2963 + 2964 + idreg++; 2965 + id = reg_to_encoding(idreg); 2966 + } 2967 + 2968 + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); 2969 + } 2970 + 3111 2971 /** 3112 2972 * kvm_reset_sys_regs - sets system registers to reset value 3113 2973 * @vcpu: The VCPU pointer ··· 3141 2957 { 3142 2958 unsigned long i; 3143 2959 3144 - for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) 3145 - if (sys_reg_descs[i].reset) 3146 - sys_reg_descs[i].reset(vcpu, &sys_reg_descs[i]); 2960 + kvm_reset_id_regs(vcpu); 2961 + 2962 + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { 2963 + const struct sys_reg_desc *r = &sys_reg_descs[i]; 2964 + 2965 + if (is_id_reg(reg_to_encoding(r))) 2966 + continue; 2967 + 2968 + if (r->reset) 2969 + r->reset(vcpu, r); 2970 + } 3147 2971 } 3148 2972 3149 2973 /** ··· 3252 3060 */ 3253 3061 3254 3062 #define FUNCTION_INVARIANT(reg) \ 3255 - static void get_##reg(struct kvm_vcpu *v, \ 3063 + static u64 get_##reg(struct kvm_vcpu *v, \ 3256 3064 const struct sys_reg_desc *r) \ 3257 3065 { \ 3258 3066 ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ 3067 + return ((struct sys_reg_desc *)r)->val; \ 3259 3068 } 3260 3069 3261 3070 FUNCTION_INVARIANT(midr_el1) 3262 3071 FUNCTION_INVARIANT(revidr_el1) 3263 3072 FUNCTION_INVARIANT(aidr_el1) 3264 3073 3265 - static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) 3074 + static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) 3266 3075 { 3267 3076 ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); 3077 + return ((struct sys_reg_desc *)r)->val; 3268 3078 } 3269 3079 3270 3080 /* ->val is filled in by kvm_sys_reg_table_init() */ ··· 3558 3364 3559 3365 int __init kvm_sys_reg_table_init(void) 3560 3366 { 3367 + struct sys_reg_params params; 3561 3368 bool valid = true; 3562 3369 
unsigned int i; 3563 3370 ··· 3576 3381 /* We abuse the reset function to overwrite the table itself. */ 3577 3382 for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) 3578 3383 invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); 3384 + 3385 + /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */ 3386 + params = encoding_to_params(SYS_ID_PFR0_EL1); 3387 + first_idreg = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); 3388 + if (!first_idreg) 3389 + return -EINVAL; 3579 3390 3580 3391 return 0; 3581 3392 }

+17 -5

arch/arm64/kvm/sys_regs.h

··· 27 27 bool is_write; 28 28 }; 29 29 30 + #define encoding_to_params(reg) \ 31 + ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \ 32 + .Op1 = sys_reg_Op1(reg), \ 33 + .CRn = sys_reg_CRn(reg), \ 34 + .CRm = sys_reg_CRm(reg), \ 35 + .Op2 = sys_reg_Op2(reg) }) 36 + 30 37 #define esr_sys64_to_params(esr) \ 31 38 ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ 32 39 .Op1 = ((esr) >> 14) & 0x7, \ ··· 71 64 struct sys_reg_params *, 72 65 const struct sys_reg_desc *); 73 66 74 - /* Initialization for vcpu. */ 75 - void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); 67 + /* 68 + * Initialization for vcpu. Return initialized value, or KVM 69 + * sanitized value for ID registers. 70 + */ 71 + u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); 76 72 77 73 /* Index into sys_reg[], or 0 if we don't need to save it. */ 78 74 int reg; 79 75 80 - /* Value (usually reset value) */ 76 + /* Value (usually reset value), or write mask for idregs */ 81 77 u64 val; 82 78 83 79 /* Custom get/set_user functions, fallback to generic if NULL */ ··· 133 123 } 134 124 135 125 /* Reset functions */ 136 - static inline void reset_unknown(struct kvm_vcpu *vcpu, 126 + static inline u64 reset_unknown(struct kvm_vcpu *vcpu, 137 127 const struct sys_reg_desc *r) 138 128 { 139 129 BUG_ON(!r->reg); 140 130 BUG_ON(r->reg >= NR_SYS_REGS); 141 131 __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; 132 + return __vcpu_sys_reg(vcpu, r->reg); 142 133 } 143 134 144 - static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 135 + static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) 145 136 { 146 137 BUG_ON(!r->reg); 147 138 BUG_ON(r->reg >= NR_SYS_REGS); 148 139 __vcpu_sys_reg(vcpu, r->reg) = r->val; 140 + return __vcpu_sys_reg(vcpu, r->reg); 149 141 } 150 142 151 143 static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu,

+4 -13

arch/arm64/mm/kasan_init.c

··· 214 214 static void __init kasan_init_shadow(void) 215 215 { 216 216 u64 kimg_shadow_start, kimg_shadow_end; 217 - u64 mod_shadow_start, mod_shadow_end; 217 + u64 mod_shadow_start; 218 218 u64 vmalloc_shadow_end; 219 219 phys_addr_t pa_start, pa_end; 220 220 u64 i; ··· 223 223 kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(KERNEL_END)); 224 224 225 225 mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); 226 - mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); 227 226 228 227 vmalloc_shadow_end = (u64)kasan_mem_to_shadow((void *)VMALLOC_END); 229 228 ··· 245 246 kasan_populate_early_shadow(kasan_mem_to_shadow((void *)PAGE_END), 246 247 (void *)mod_shadow_start); 247 248 248 - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { 249 - BUILD_BUG_ON(VMALLOC_START != MODULES_END); 250 - kasan_populate_early_shadow((void *)vmalloc_shadow_end, 251 - (void *)KASAN_SHADOW_END); 252 - } else { 253 - kasan_populate_early_shadow((void *)kimg_shadow_end, 254 - (void *)KASAN_SHADOW_END); 255 - if (kimg_shadow_start > mod_shadow_end) 256 - kasan_populate_early_shadow((void *)mod_shadow_end, 257 - (void *)kimg_shadow_start); 258 - } 249 + BUILD_BUG_ON(VMALLOC_START != MODULES_END); 250 + kasan_populate_early_shadow((void *)vmalloc_shadow_end, 251 + (void *)KASAN_SHADOW_END); 259 252 260 253 for_each_mem_range(i, &pa_start, &pa_end) { 261 254 void *start = (void *)__phys_to_virt(pa_start);

+3

arch/arm64/tools/cpucaps

··· 25 25 HAS_ECV 26 26 HAS_ECV_CNTPOFF 27 27 HAS_EPAN 28 + HAS_EVT 28 29 HAS_GENERIC_AUTH 29 30 HAS_GENERIC_AUTH_ARCH_QARMA3 30 31 HAS_GENERIC_AUTH_ARCH_QARMA5 ··· 48 47 HAS_VIRT_HOST_EXTN 49 48 HAS_WFXT 50 49 HW_DBM 50 + KVM_HVHE 51 51 KVM_PROTECTED_MODE 52 52 MISMATCHED_CACHE_TYPE 53 53 MTE ··· 79 77 WORKAROUND_2457168 80 78 WORKAROUND_2645198 81 79 WORKAROUND_2658417 80 + WORKAROUND_AMPERE_AC03_CPU_38 82 81 WORKAROUND_TRBE_OVERWRITE_FILL_MODE 83 82 WORKAROUND_TSB_FLUSH_FAILURE 84 83 WORKAROUND_TRBE_WRITE_OUT_OF_RANGE

+6 -2

include/kvm/arm_pmu.h

··· 92 92 /* 93 93 * Evaluates as true when emulating PMUv3p5, and false otherwise. 94 94 */ 95 - #define kvm_pmu_is_3p5(vcpu) \ 96 - (vcpu->kvm->arch.dfr0_pmuver.imp >= ID_AA64DFR0_EL1_PMUVer_V3P5) 95 + #define kvm_pmu_is_3p5(vcpu) ({ \ 96 + u64 val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); \ 97 + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); \ 98 + \ 99 + pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5; \ 100 + }) 97 101 98 102 u8 kvm_arm_pmu_get_pmuver_limit(void); 99 103

+8

include/linux/arm_ffa.h

··· 94 94 */ 95 95 #define FFA_PAGE_SIZE SZ_4K 96 96 97 + /* 98 + * Minimum buffer size/alignment encodings returned by an FFA_FEATURES 99 + * query for FFA_RXTX_MAP. 100 + */ 101 + #define FFA_FEAT_RXTX_MIN_SZ_4K 0 102 + #define FFA_FEAT_RXTX_MIN_SZ_64K 1 103 + #define FFA_FEAT_RXTX_MIN_SZ_16K 2 104 + 97 105 /* FFA Bus/Device/Driver related */ 98 106 struct ffa_device { 99 107 u32 id;

+2

include/linux/kvm_host.h

··· 991 991 return RB_EMPTY_ROOT(&slots->gfn_tree); 992 992 } 993 993 994 + bool kvm_are_all_memslots_empty(struct kvm *kvm); 995 + 994 996 #define kvm_for_each_memslot(memslot, bkt, slots) \ 995 997 hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \ 996 998 if (WARN_ON_ONCE(!memslot->npages)) { \

+2

include/uapi/linux/kvm.h

··· 1190 1190 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 1191 1191 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 1192 1192 #define KVM_CAP_COUNTER_OFFSET 227 1193 + #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 1194 + #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 1193 1195 1194 1196 #ifdef KVM_CAP_IRQ_ROUTING 1195 1197

+2 -1

virt/kvm/kvm_main.c

··· 4620 4620 return -EINVAL; 4621 4621 } 4622 4622 4623 - static bool kvm_are_all_memslots_empty(struct kvm *kvm) 4623 + bool kvm_are_all_memslots_empty(struct kvm *kvm) 4624 4624 { 4625 4625 int i; 4626 4626 ··· 4633 4633 4634 4634 return true; 4635 4635 } 4636 + EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); 4636 4637 4637 4638 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 4638 4639 struct kvm_enable_cap *cap)

Configure Feed

Configure Feed