Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

KVM: arm64: gic-v5: Sanitize ID_AA64PFR2_EL1.GCIE

Add a sanitisation function for ID_AA64PFR2_EL1, preserving the
already-present behaviour for the FPMR, MTEFAR, and MTESTOREONLY
fields. Add sanitisation for the GCIE field, which is set to IMP if
the host supports a GICv5 guest, and to NI otherwise.

Extend the sanitisation that takes place in kvm_vgic_create() to zero
the ID_AA64PFR2.GCIE field when a non-GICv5 GIC is created. More
importantly, move this sanitisation to a separate function,
kvm_vgic_finalize_idregs(), and call it from kvm_finalize_sys_regs().

We are required to finalize the GIC and GCIE fields a second time in
kvm_finalize_sys_regs() because QEMU blindly reads out the system
register state and then restores it verbatim. This avoids the issue
where both the GCIE and GIC features are marked as present (an
architecturally invalid combination), and hence guests fall over. See
the comment in kvm_finalize_sys_regs() for more details.

Overall, the following happens:

* Before an irqchip is created, FEAT_GCIE is presented if the host
supports GICv5-based guests.
* Once an irqchip is created, all other supported irqchips are hidden
from the guest; system register state reflects the guest's irqchip.
* Userspace is allowed to set invalid irqchip feature combinations in
the system registers, but...
* ...invalid combinations are removed a second time prior to the first
run of the guest, and things hopefully just work.

All of this extra work is required to make sure that "legacy" GICv3
guests based on QEMU transparently work on compatible GICv5 hosts
without modification.

Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20260319154937.3619520-13-sascha.bischoff@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>

authored by

Sascha Bischoff and committed by
Marc Zyngier
a258a383 f6568071

+98 -22
+62 -8
arch/arm64/kvm/sys_regs.c
··· 1758 1758 1759 1759 static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val); 1760 1760 static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu, u64 val); 1761 + static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val); 1761 1762 static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val); 1762 1763 1763 1764 /* Read a sanitised cpufeature ID register by sys_reg_desc */ ··· 1784 1783 val = sanitise_id_aa64pfr1_el1(vcpu, val); 1785 1784 break; 1786 1785 case SYS_ID_AA64PFR2_EL1: 1787 - val &= ID_AA64PFR2_EL1_FPMR | 1788 - (kvm_has_mte(vcpu->kvm) ? 1789 - ID_AA64PFR2_EL1_MTEFAR | ID_AA64PFR2_EL1_MTESTOREONLY : 1790 - 0); 1786 + val = sanitise_id_aa64pfr2_el1(vcpu, val); 1791 1787 break; 1792 1788 case SYS_ID_AA64ISAR1_EL1: 1793 1789 if (!vcpu_has_ptrauth(vcpu)) ··· 2025 2027 return val; 2026 2028 } 2027 2029 2030 + static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu, u64 val) 2031 + { 2032 + val &= ID_AA64PFR2_EL1_FPMR | 2033 + ID_AA64PFR2_EL1_MTEFAR | 2034 + ID_AA64PFR2_EL1_MTESTOREONLY; 2035 + 2036 + if (!kvm_has_mte(vcpu->kvm)) { 2037 + val &= ~ID_AA64PFR2_EL1_MTEFAR; 2038 + val &= ~ID_AA64PFR2_EL1_MTESTOREONLY; 2039 + } 2040 + 2041 + if (vgic_host_has_gicv5()) 2042 + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP); 2043 + 2044 + return val; 2045 + } 2046 + 2028 2047 static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) 2029 2048 { 2030 2049 val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8); ··· 2228 2213 user_val |= hw_val & ID_AA64PFR1_EL1_MTE_frac_MASK; 2229 2214 } 2230 2215 2216 + return set_id_reg(vcpu, rd, user_val); 2217 + } 2218 + 2219 + static int set_id_aa64pfr2_el1(struct kvm_vcpu *vcpu, 2220 + const struct sys_reg_desc *rd, u64 user_val) 2221 + { 2231 2222 return set_id_reg(vcpu, rd, user_val); 2232 2223 } 2233 2224 ··· 3218 3197 ID_AA64PFR1_EL1_RES0 | 3219 3198 ID_AA64PFR1_EL1_MPAM_frac | 3220 3199 ID_AA64PFR1_EL1_MTE)), 3221 - 
ID_WRITABLE(ID_AA64PFR2_EL1, 3222 - ID_AA64PFR2_EL1_FPMR | 3223 - ID_AA64PFR2_EL1_MTEFAR | 3224 - ID_AA64PFR2_EL1_MTESTOREONLY), 3200 + ID_FILTERED(ID_AA64PFR2_EL1, id_aa64pfr2_el1, 3201 + ~(ID_AA64PFR2_EL1_FPMR | 3202 + ID_AA64PFR2_EL1_MTEFAR | 3203 + ID_AA64PFR2_EL1_MTESTOREONLY | 3204 + ID_AA64PFR2_EL1_GCIE)), 3225 3205 ID_UNALLOCATED(4,3), 3226 3206 ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), 3227 3207 ID_HIDDEN(ID_AA64SMFR0_EL1), ··· 5693 5671 5694 5672 val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; 5695 5673 kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val); 5674 + val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE; 5675 + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, val); 5696 5676 val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; 5697 5677 kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val); 5678 + } else { 5679 + /* 5680 + * Certain userspace software - QEMU - samples the system 5681 + * register state without creating an irqchip, then blindly 5682 + * restores the state prior to running the final guest. This 5683 + * means that it restores the virtualization & emulation 5684 + * capabilities of the host system, rather than something that 5685 + * reflects the final guest state. Moreover, it checks that the 5686 + * state was "correctly" restored (i.e., verbatim), bailing if 5687 + * it isn't, so masking off invalid state isn't an option. 5688 + * 5689 + * On GICv5 hardware that supports FEAT_GCIE_LEGACY we can run 5690 + * both GICv3- and GICv5-based guests. Therefore, we initially 5691 + * present both ID_AA64PFR0.GIC and ID_AA64PFR2.GCIE as IMP to 5692 + * reflect that userspace can create EITHER a vGICv3 or a 5693 + * vGICv5. This is an architecturally invalid combination, of 5694 + * course. Once an in-kernel GIC is created, the sysreg state is 5695 + * updated to reflect the actual, valid configuration. 
5696 + * 5697 + * Setting both the GIC and GCIE features to IMP unsurprisingly 5698 + * results in guests falling over, and hence we need to fix up 5699 + * this mess in KVM. Before running for the first time we yet 5700 + * again ensure that the GIC and GCIE fields accurately reflect 5701 + * the actual hardware the guest should see. 5702 + * 5703 + * This hack allows legacy QEMU-based GICv3 guests to run 5704 + * unmodified on compatible GICv5 hosts, and avoids the inverse 5705 + * problem for GICv5-based guests in the future. 5706 + */ 5707 + kvm_vgic_finalize_idregs(kvm); 5698 5708 } 5699 5709 5700 5710 if (vcpu_has_nv(vcpu)) {
+35 -14
arch/arm64/kvm/vgic/vgic-init.c
··· 71 71 int kvm_vgic_create(struct kvm *kvm, u32 type) 72 72 { 73 73 struct kvm_vcpu *vcpu; 74 - u64 aa64pfr0, pfr1; 75 74 unsigned long i; 76 75 int ret; 77 76 ··· 144 145 kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST; 145 146 kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; 146 147 147 - aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; 148 - pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; 149 - 150 - if (type == KVM_DEV_TYPE_ARM_VGIC_V2) { 151 - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; 152 - } else { 153 - INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); 154 - aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); 155 - pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); 156 - } 157 - 158 - kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); 159 - kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); 148 + /* 149 + * We've now created the GIC. Update the system register state 150 + * to accurately reflect what we've created. 
151 + */ 152 + kvm_vgic_finalize_idregs(kvm); 160 153 161 154 kvm_for_each_vcpu(i, vcpu, kvm) { 162 155 ret = vgic_allocate_private_irqs_locked(vcpu, type); ··· 606 615 mutex_unlock(&kvm->slots_lock); 607 616 608 617 return ret; 618 + } 619 + 620 + void kvm_vgic_finalize_idregs(struct kvm *kvm) 621 + { 622 + u32 type = kvm->arch.vgic.vgic_model; 623 + u64 aa64pfr0, aa64pfr2, pfr1; 624 + 625 + aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC; 626 + aa64pfr2 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) & ~ID_AA64PFR2_EL1_GCIE; 627 + pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC; 628 + 629 + switch (type) { 630 + case KVM_DEV_TYPE_ARM_VGIC_V2: 631 + kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; 632 + break; 633 + case KVM_DEV_TYPE_ARM_VGIC_V3: 634 + INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions); 635 + aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); 636 + pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3); 637 + break; 638 + case KVM_DEV_TYPE_ARM_VGIC_V5: 639 + aa64pfr2 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE, IMP); 640 + break; 641 + default: 642 + WARN_ONCE(1, "Unknown VGIC type!!!\n"); 643 + } 644 + 645 + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0); 646 + kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, aa64pfr2); 647 + kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1); 609 648 } 610 649 611 650 /* GENERIC PROBE */
+1
include/kvm/arm_vgic.h
··· 485 485 void kvm_vgic_destroy(struct kvm *kvm); 486 486 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); 487 487 int kvm_vgic_map_resources(struct kvm *kvm); 488 + void kvm_vgic_finalize_idregs(struct kvm *kvm); 488 489 int kvm_vgic_hyp_init(void); 489 490 void kvm_vgic_init_cpu_hardware(void); 490 491