Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch kvm-arm64/52bit-at into kvmarm-master/next

* kvm-arm64/52bit-at:
: .
: Upgrade the S1 page table walker to support 52bit PA, and use it to
: report the fault level when taking a S2 fault on S1PTW, which is required
: by the architecture (20250915114451.660351-1-maz@kernel.org).
: .
KVM: arm64: selftest: Expand external_aborts test to look for TTW levels
KVM: arm64: Populate level on S1PTW SEA injection
KVM: arm64: Add S1 IPA to page table level walker
KVM: arm64: Add filtering hook to S1 page table walk
KVM: arm64: Don't switch MMU on translation from non-NV context
KVM: arm64: Allow EL1 control registers to be accessed from the CPU state
KVM: arm64: Allow use of S1 PTW for non-NV vcpus
KVM: arm64: Report faults from S1 walk setup at the expected start level
KVM: arm64: Expand valid block mappings to FEAT_LPA/LPA2 support
KVM: arm64: Populate PAR_EL1 with 52bit addresses
KVM: arm64: Compute shareability for LPA2
KVM: arm64: Pass the walk_info structure to compute_par_s1()
KVM: arm64: Decouple output address from the PT descriptor
KVM: arm64: Compute 52bit TTBR address and alignment
KVM: arm64: Account for 52bit when computing maximum OA
KVM: arm64: Add helper computing the state of 52bit PA support

Signed-off-by: Marc Zyngier <maz@kernel.org>

+374 -112
+23 -2
arch/arm64/include/asm/kvm_nested.h
··· 265 265 return base; 266 266 } 267 267 268 - static inline unsigned int ps_to_output_size(unsigned int ps) 268 + static inline unsigned int ps_to_output_size(unsigned int ps, bool pa52bit) 269 269 { 270 270 switch (ps) { 271 271 case 0: return 32; ··· 273 273 case 2: return 40; 274 274 case 3: return 42; 275 275 case 4: return 44; 276 - case 5: 276 + case 5: return 48; 277 + case 6: if (pa52bit) 278 + return 52; 279 + fallthrough; 277 280 default: 278 281 return 48; 279 282 } ··· 288 285 TR_EL2, 289 286 }; 290 287 288 + struct s1_walk_info; 289 + 290 + struct s1_walk_context { 291 + struct s1_walk_info *wi; 292 + u64 table_ipa; 293 + int level; 294 + }; 295 + 296 + struct s1_walk_filter { 297 + int (*fn)(struct s1_walk_context *, void *); 298 + void *priv; 299 + }; 300 + 291 301 struct s1_walk_info { 302 + struct s1_walk_filter *filter; 292 303 u64 baddr; 293 304 enum trans_regime regime; 294 305 unsigned int max_oa_bits; 295 306 unsigned int pgshift; 296 307 unsigned int txsz; 297 308 int sl; 309 + u8 sh; 298 310 bool as_el0; 299 311 bool hpd; 300 312 bool e0poe; ··· 317 299 bool pan; 318 300 bool be; 319 301 bool s2; 302 + bool pa52bit; 320 303 }; 321 304 322 305 struct s1_walk_result { ··· 353 334 354 335 int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, 355 336 struct s1_walk_result *wr, u64 va); 337 + int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, 338 + int *level); 356 339 357 340 /* VNCR management */ 358 341 int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu);
+270 -106
arch/arm64/kvm/at.c
··· 28 28 /* Return true if the IPA is out of the OA range */ 29 29 static bool check_output_size(u64 ipa, struct s1_walk_info *wi) 30 30 { 31 + if (wi->pa52bit) 32 + return wi->max_oa_bits < 52 && (ipa & GENMASK_ULL(51, wi->max_oa_bits)); 31 33 return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits)); 34 + } 35 + 36 + static bool has_52bit_pa(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, u64 tcr) 37 + { 38 + switch (BIT(wi->pgshift)) { 39 + case SZ_64K: 40 + default: /* IMPDEF: treat any other value as 64k */ 41 + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52)) 42 + return false; 43 + return ((wi->regime == TR_EL2 ? 44 + FIELD_GET(TCR_EL2_PS_MASK, tcr) : 45 + FIELD_GET(TCR_IPS_MASK, tcr)) == 0b0110); 46 + case SZ_16K: 47 + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) 48 + return false; 49 + break; 50 + case SZ_4K: 51 + if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) 52 + return false; 53 + break; 54 + } 55 + 56 + return (tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS)); 57 + } 58 + 59 + static u64 desc_to_oa(struct s1_walk_info *wi, u64 desc) 60 + { 61 + u64 addr; 62 + 63 + if (!wi->pa52bit) 64 + return desc & GENMASK_ULL(47, wi->pgshift); 65 + 66 + switch (BIT(wi->pgshift)) { 67 + case SZ_4K: 68 + case SZ_16K: 69 + addr = desc & GENMASK_ULL(49, wi->pgshift); 70 + addr |= FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, desc) << 50; 71 + break; 72 + case SZ_64K: 73 + default: /* IMPDEF: treat any other value as 64k */ 74 + addr = desc & GENMASK_ULL(47, wi->pgshift); 75 + addr |= FIELD_GET(KVM_PTE_ADDR_51_48, desc) << 48; 76 + break; 77 + } 78 + 79 + return addr; 32 80 } 33 81 34 82 /* Return the translation regime that applies to an AT instruction */ ··· 98 50 } 99 51 } 100 52 53 + static u64 effective_tcr2(struct kvm_vcpu *vcpu, enum trans_regime regime) 54 + { 55 + if (regime == TR_EL10) { 56 + if (vcpu_has_nv(vcpu) && 57 + !(__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En)) 58 + return 0; 59 + 60 + return vcpu_read_sys_reg(vcpu, TCR2_EL1); 61 + } 62 + 63 + return vcpu_read_sys_reg(vcpu, TCR2_EL2); 64 + } 65 + 101 66 static bool s1pie_enabled(struct kvm_vcpu *vcpu, enum trans_regime regime) 102 67 { 103 68 if (!kvm_has_s1pie(vcpu->kvm)) 104 69 return false; 105 70 106 - switch (regime) { 107 - case TR_EL2: 108 - case TR_EL20: 109 - return vcpu_read_sys_reg(vcpu, TCR2_EL2) & TCR2_EL2_PIE; 110 - case TR_EL10: 111 - return (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) && 112 - (__vcpu_sys_reg(vcpu, TCR2_EL1) & TCR2_EL1_PIE); 113 - default: 114 - BUG(); 115 - } 71 + /* Abuse TCR2_EL1_PIE and use it for EL2 as well */ 72 + return effective_tcr2(vcpu, regime) & TCR2_EL1_PIE; 116 73 } 117 74 118 75 static void compute_s1poe(struct kvm_vcpu *vcpu, struct s1_walk_info *wi) ··· 129 76 return; 130 77 } 131 78 132 - switch (wi->regime) { 133 - case TR_EL2: 134 - case TR_EL20: 135 - val = vcpu_read_sys_reg(vcpu, TCR2_EL2); 136 - wi->poe = val & TCR2_EL2_POE; 137 - wi->e0poe = (wi->regime == TR_EL20) && (val & TCR2_EL2_E0POE); 138 - break; 139 - case TR_EL10: 140 - if (__vcpu_sys_reg(vcpu, HCRX_EL2) & HCRX_EL2_TCR2En) { 141 - wi->poe = wi->e0poe = false; 142 - return; 143 - } 79 + val = effective_tcr2(vcpu, wi->regime); 144 80 145 - val = __vcpu_sys_reg(vcpu, TCR2_EL1); 146 - wi->poe = val & TCR2_EL1_POE; 147 - wi->e0poe = val & TCR2_EL1_E0POE; 148 - } 81 + /* Abuse TCR2_EL1_* for EL2 */ 82 + wi->poe = val & TCR2_EL1_POE; 83 + wi->e0poe = (wi->regime != TR_EL2) && (val & TCR2_EL1_E0POE); 149 84 } 150 85 151 86 static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, ··· 143 102 unsigned int stride, x; 144 103 bool va55, tbi, lva; 145 104 146 - hcr = __vcpu_sys_reg(vcpu, HCR_EL2); 147 - 148 105 va55 = va & BIT(55); 149 106 150 - if (wi->regime == TR_EL2 && va55) 151 - goto addrsz; 152 - 153 - wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); 107 + if (vcpu_has_nv(vcpu)) { 108 + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); 109 + wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC)); 110 + } else { 111 + WARN_ON_ONCE(wi->regime != TR_EL10); 112 + wi->s2 = false; 113 + hcr = 0; 114 + } 154 115 155 116 switch (wi->regime) { 156 117 case TR_EL10: ··· 174 131 BUG(); 175 132 } 176 133 134 + /* Someone was silly enough to encode TG0/TG1 differently */ 135 + if (va55 && wi->regime != TR_EL2) { 136 + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); 137 + tg = FIELD_GET(TCR_TG1_MASK, tcr); 138 + 139 + switch (tg << TCR_TG1_SHIFT) { 140 + case TCR_TG1_4K: 141 + wi->pgshift = 12; break; 142 + case TCR_TG1_16K: 143 + wi->pgshift = 14; break; 144 + case TCR_TG1_64K: 145 + default: /* IMPDEF: treat any other value as 64k */ 146 + wi->pgshift = 16; break; 147 + } 148 + } else { 149 + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); 150 + tg = FIELD_GET(TCR_TG0_MASK, tcr); 151 + 152 + switch (tg << TCR_TG0_SHIFT) { 153 + case TCR_TG0_4K: 154 + wi->pgshift = 12; break; 155 + case TCR_TG0_16K: 156 + wi->pgshift = 14; break; 157 + case TCR_TG0_64K: 158 + default: /* IMPDEF: treat any other value as 64k */ 159 + wi->pgshift = 16; break; 160 + } 161 + } 162 + 163 + wi->pa52bit = has_52bit_pa(vcpu, wi, tcr); 164 + 165 + ia_bits = get_ia_size(wi); 166 + 167 + /* AArch64.S1StartLevel() */ 168 + stride = wi->pgshift - 3; 169 + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); 170 + 171 + if (wi->regime == TR_EL2 && va55) 172 + goto addrsz; 173 + 177 174 tbi = (wi->regime == TR_EL2 ? 178 175 FIELD_GET(TCR_EL2_TBI, tcr) : 179 176 (va55 ? ··· 222 139 223 140 if (!tbi && (u64)sign_extend64(va, 55) != va) 224 141 goto addrsz; 142 + 143 + wi->sh = (wi->regime == TR_EL2 ? 144 + FIELD_GET(TCR_EL2_SH0_MASK, tcr) : 145 + (va55 ? 146 + FIELD_GET(TCR_SH1_MASK, tcr) : 147 + FIELD_GET(TCR_SH0_MASK, tcr))); 225 148 226 149 va = (u64)sign_extend64(va, 55); 227 150 ··· 283 194 /* R_BVXDG */ 284 195 wi->hpd |= (wi->poe || wi->e0poe); 285 196 286 - /* Someone was silly enough to encode TG0/TG1 differently */ 287 - if (va55) { 288 - wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); 289 - tg = FIELD_GET(TCR_TG1_MASK, tcr); 290 - 291 - switch (tg << TCR_TG1_SHIFT) { 292 - case TCR_TG1_4K: 293 - wi->pgshift = 12; break; 294 - case TCR_TG1_16K: 295 - wi->pgshift = 14; break; 296 - case TCR_TG1_64K: 297 - default: /* IMPDEF: treat any other value as 64k */ 298 - wi->pgshift = 16; break; 299 - } 300 - } else { 301 - wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); 302 - tg = FIELD_GET(TCR_TG0_MASK, tcr); 303 - 304 - switch (tg << TCR_TG0_SHIFT) { 305 - case TCR_TG0_4K: 306 - wi->pgshift = 12; break; 307 - case TCR_TG0_16K: 308 - wi->pgshift = 14; break; 309 - case TCR_TG0_64K: 310 - default: /* IMPDEF: treat any other value as 64k */ 311 - wi->pgshift = 16; break; 312 - } 313 - } 314 - 315 197 /* R_PLCGL, R_YXNYW */ 316 198 if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) { 317 199 if (wi->txsz > 39) 318 - goto transfault_l0; 200 + goto transfault; 319 201 } else { 320 202 if (wi->txsz > 48 || (BIT(wi->pgshift) == SZ_64K && wi->txsz > 47)) 321 - goto transfault_l0; 203 + goto transfault; 322 204 } 323 205 324 206 /* R_GTJBY, R_SXWGM */ 325 207 switch (BIT(wi->pgshift)) { 326 208 case SZ_4K: 327 - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT); 328 - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); 329 - break; 330 209 case SZ_16K: 331 - lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT); 332 - lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS); 210 + lva = wi->pa52bit; 333 211 break; 334 212 case SZ_64K: 335 213 lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52); ··· 304 248 } 305 249 306 250 if ((lva && wi->txsz < 12) || (!lva && wi->txsz < 16)) 307 - goto transfault_l0; 308 - 309 - ia_bits = get_ia_size(wi); 251 + goto transfault; 310 252 311 253 /* R_YYVYV, I_THCZK */ 312 254 if ((!va55 && va > GENMASK(ia_bits - 1, 0)) || 313 255 (va55 && va < GENMASK(63, ia_bits))) 314 - goto transfault_l0; 256 + goto transfault; 315 257 316 258 /* I_ZFSYQ */ 317 259 if (wi->regime != TR_EL2 && 318 260 (tcr & (va55 ? TCR_EPD1_MASK : TCR_EPD0_MASK))) 319 - goto transfault_l0; 261 + goto transfault; 320 262 321 263 /* R_BNDVG and following statements */ 322 264 if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) && 323 265 wi->as_el0 && (tcr & (va55 ? TCR_E0PD1 : TCR_E0PD0))) 324 - goto transfault_l0; 325 - 326 - /* AArch64.S1StartLevel() */ 327 - stride = wi->pgshift - 3; 328 - wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); 266 + goto transfault; 329 267 330 268 ps = (wi->regime == TR_EL2 ? 331 269 FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); 332 270 333 - wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); 271 + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps, wi->pa52bit)); 334 272 335 273 /* Compute minimal alignment */ 336 274 x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); 337 275 338 276 wi->baddr = ttbr & TTBRx_EL1_BADDR; 277 + if (wi->pa52bit) { 278 + /* 279 + * Force the alignment on 64 bytes for top-level tables 280 + * smaller than 8 entries, since TTBR.BADDR[5:2] are used to 281 + * store bits [51:48] of the first level of lookup. 282 + */ 283 + x = max(x, 6); 284 + 285 + wi->baddr |= FIELD_GET(GENMASK_ULL(5, 2), ttbr) << 48; 286 + } 339 287 340 288 /* R_VPBBF */ 341 289 if (check_output_size(wi->baddr, wi)) ··· 349 289 350 290 return 0; 351 291 352 - addrsz: /* Address Size Fault level 0 */ 292 + addrsz: 293 + /* 294 + * Address Size Fault level 0 to indicate it comes from TTBR. 295 + * yes, this is an oddity. 296 + */ 353 297 fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false); 354 298 return -EFAULT; 355 299 356 - transfault_l0: /* Translation Fault level 0 */ 357 - fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false); 300 + transfault: 301 + /* Translation Fault on start level */ 302 + fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(wi->sl), false); 358 303 return -EFAULT; 359 304 } 360 305 ··· 404 339 ipa = kvm_s2_trans_output(&s2_trans); 405 340 } 406 341 342 + if (wi->filter) { 343 + ret = wi->filter->fn(&(struct s1_walk_context) 344 + { 345 + .wi = wi, 346 + .table_ipa = baddr, 347 + .level = level, 348 + }, wi->filter->priv); 349 + if (ret) 350 + return ret; 351 + } 352 + 407 353 ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); 408 354 if (ret) { 409 355 fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false); ··· 445 369 wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); 446 370 } 447 371 448 - baddr = desc & GENMASK_ULL(47, wi->pgshift); 372 + baddr = desc_to_oa(wi, desc); 449 373 450 374 /* Check for out-of-range OA */ 451 375 if (check_output_size(baddr, wi)) ··· 462 386 463 387 switch (BIT(wi->pgshift)) { 464 388 case SZ_4K: 465 - valid_block = level == 1 || level == 2; 389 + valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0); 466 390 break; 467 391 case SZ_16K: 468 392 case SZ_64K: 469 - valid_block = level == 2; 393 + valid_block = level == 2 || (wi->pa52bit && level == 1); 470 394 break; 471 395 } 472 396 ··· 474 398 goto transfault; 475 399 } 476 400 477 - if (check_output_size(desc & GENMASK(47, va_bottom), wi)) 401 + baddr = desc_to_oa(wi, desc); 402 + if (check_output_size(baddr & GENMASK(52, va_bottom), wi)) 478 403 goto addrsz; 479 404 480 405 if (!(desc & PTE_AF)) { ··· 488 411 wr->failed = false; 489 412 wr->level = level; 490 413 wr->desc = desc; 491 - wr->pa = desc & GENMASK(47, va_bottom); 414 + wr->pa = baddr & GENMASK(52, va_bottom); 492 415 wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); 493 416 494 417 wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); ··· 717 640 #define ATTR_OSH 0b10 718 641 #define ATTR_ISH 0b11 719 642 720 - static u8 compute_sh(u8 attr, u64 desc) 643 + static u8 compute_final_sh(u8 attr, u8 sh) 721 644 { 722 - u8 sh; 723 - 724 645 /* Any form of device, as well as NC has SH[1:0]=0b10 */ 725 646 if (MEMATTR_IS_DEVICE(attr) || attr == MEMATTR(NC, NC)) 726 647 return ATTR_OSH; 727 648 728 - sh = FIELD_GET(PTE_SHARED, desc); 729 649 if (sh == ATTR_RSV) /* Reserved, mapped to NSH */ 730 650 sh = ATTR_NSH; 731 651 732 652 return sh; 653 + } 654 + 655 + static u8 compute_s1_sh(struct s1_walk_info *wi, struct s1_walk_result *wr, 656 + u8 attr) 657 + { 658 + u8 sh; 659 + 660 + /* 661 + * non-52bit and LPA have their basic shareability described in the 662 + * descriptor. LPA2 gets it from the corresponding field in TCR, 663 + * conveniently recorded in the walk info. 664 + */ 665 + if (!wi->pa52bit || BIT(wi->pgshift) == SZ_64K) 666 + sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_SH, wr->desc); 667 + else 668 + sh = wi->sh; 669 + 670 + return compute_final_sh(attr, sh); 733 671 } 734 672 735 673 static u8 combine_sh(u8 s1_sh, u8 s2_sh) ··· 760 668 static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, 761 669 struct kvm_s2_trans *tr) 762 670 { 763 - u8 s1_parattr, s2_memattr, final_attr; 671 + u8 s1_parattr, s2_memattr, final_attr, s2_sh; 764 672 u64 par; 765 673 766 674 /* If S2 has failed to translate, report the damage */ ··· 833 741 !MEMATTR_IS_DEVICE(final_attr)) 834 742 final_attr = MEMATTR(NC, NC); 835 743 744 + s2_sh = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_SH, tr->desc); 745 + 836 746 par = FIELD_PREP(SYS_PAR_EL1_ATTR, final_attr); 837 747 par |= tr->output & GENMASK(47, 12); 838 748 par |= FIELD_PREP(SYS_PAR_EL1_SH, 839 749 combine_sh(FIELD_GET(SYS_PAR_EL1_SH, s1_par), 840 - compute_sh(final_attr, tr->desc))); 750 + compute_final_sh(final_attr, s2_sh))); 841 751 842 752 return par; 843 753 } 844 754 845 - static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr, 846 - enum trans_regime regime) 755 + static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, 756 + struct s1_walk_result *wr) 847 757 { 848 758 u64 par; 849 759 ··· 858 764 } else if (wr->level == S1_MMU_DISABLED) { 859 765 /* MMU off or HCR_EL2.DC == 1 */ 860 766 par = SYS_PAR_EL1_NSE; 861 - par |= wr->pa & GENMASK_ULL(47, 12); 767 + par |= wr->pa & SYS_PAR_EL1_PA; 862 768 863 - if (regime == TR_EL10 && 769 + if (wi->regime == TR_EL10 && vcpu_has_nv(vcpu) && 864 770 (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { 865 771 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 866 772 MEMATTR(WbRaWa, WbRaWa)); ··· 875 781 876 782 par = SYS_PAR_EL1_NSE; 877 783 878 - mair = (regime == TR_EL10 ? 784 + mair = (wi->regime == TR_EL10 ? 879 785 vcpu_read_sys_reg(vcpu, MAIR_EL1) : 880 786 vcpu_read_sys_reg(vcpu, MAIR_EL2)); 881 787 882 788 mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; 883 789 mair &= 0xff; 884 790 885 - sctlr = (regime == TR_EL10 ? 791 + sctlr = (wi->regime == TR_EL10 ? 886 792 vcpu_read_sys_reg(vcpu, SCTLR_EL1) : 887 793 vcpu_read_sys_reg(vcpu, SCTLR_EL2)); 888 794 ··· 891 797 mair = MEMATTR(NC, NC); 892 798 893 799 par |= FIELD_PREP(SYS_PAR_EL1_ATTR, mair); 894 - par |= wr->pa & GENMASK_ULL(47, 12); 800 + par |= wr->pa & SYS_PAR_EL1_PA; 895 801 896 - sh = compute_sh(mair, wr->desc); 802 + sh = compute_s1_sh(wi, wr, mair); 897 803 par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); 898 804 } 899 805 ··· 967 873 wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_WXN); 968 874 break; 969 875 case TR_EL10: 970 - wxn = (__vcpu_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); 876 + wxn = (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_WXN); 971 877 break; 972 878 } 973 879 ··· 1280 1186 fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false); 1281 1187 1282 1188 compute_par: 1283 - return compute_par_s1(vcpu, &wr, wi.regime); 1189 + return compute_par_s1(vcpu, &wi, &wr); 1284 1190 } 1285 1191 1286 1192 /* ··· 1296 1202 { 1297 1203 struct mmu_config config; 1298 1204 struct kvm_s2_mmu *mmu; 1299 - bool fail; 1205 + bool fail, mmu_cs; 1300 1206 u64 par; 1301 1207 1302 1208 par = SYS_PAR_EL1_F; ··· 1312 1218 * If HCR_EL2.{E2H,TGE} == {1,1}, the MMU context is already 1313 1219 * the right one (as we trapped from vEL2). If not, save the 1314 1220 * full MMU context. 1221 + * 1222 + * We are also guaranteed to be in the correct context if 1223 + * we're not in a nested VM. 1315 1224 */ 1316 - if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) 1225 + mmu_cs = (vcpu_has_nv(vcpu) && 1226 + !(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))); 1227 + if (!mmu_cs) 1317 1228 goto skip_mmu_switch; 1318 1229 1319 1230 /* ··· 1386 1287 1387 1288 write_sysreg_hcr(HCR_HOST_VHE_FLAGS); 1388 1289 1389 - if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu))) 1290 + if (mmu_cs) 1390 1291 __mmu_config_restore(&config); 1391 1292 1392 1293 return par; ··· 1568 1469 } 1569 1470 1570 1471 return 0; 1472 + } 1473 + 1474 + struct desc_match { 1475 + u64 ipa; 1476 + int level; 1477 + }; 1478 + 1479 + static int match_s1_desc(struct s1_walk_context *ctxt, void *priv) 1480 + { 1481 + struct desc_match *dm = priv; 1482 + u64 ipa = dm->ipa; 1483 + 1484 + /* Use S1 granule alignment */ 1485 + ipa &= GENMASK(51, ctxt->wi->pgshift); 1486 + 1487 + /* Not the IPA we're looking for? Continue. */ 1488 + if (ipa != ctxt->table_ipa) 1489 + return 0; 1490 + 1491 + /* Note the level and interrupt the walk */ 1492 + dm->level = ctxt->level; 1493 + return -EINTR; 1494 + } 1495 + 1496 + int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level) 1497 + { 1498 + struct desc_match dm = { 1499 + .ipa = ipa, 1500 + }; 1501 + struct s1_walk_info wi = { 1502 + .filter = &(struct s1_walk_filter){ 1503 + .fn = match_s1_desc, 1504 + .priv = &dm, 1505 + }, 1506 + .regime = TR_EL10, 1507 + .as_el0 = false, 1508 + .pan = false, 1509 + }; 1510 + struct s1_walk_result wr = {}; 1511 + int ret; 1512 + 1513 + ret = setup_s1_walk(vcpu, &wi, &wr, va); 1514 + if (ret) 1515 + return ret; 1516 + 1517 + /* We really expect the S1 MMU to be on here... */ 1518 + if (WARN_ON_ONCE(wr.level == S1_MMU_DISABLED)) { 1519 + *level = 0; 1520 + return 0; 1521 + } 1522 + 1523 + /* Walk the guest's PT, looking for a match along the way */ 1524 + ret = walk_s1(vcpu, &wi, &wr, va); 1525 + switch (ret) { 1526 + case -EINTR: 1527 + /* We interrupted the walk on a match, return the level */ 1528 + *level = dm.level; 1529 + return 0; 1530 + case 0: 1531 + /* The walk completed, we failed to find the entry */ 1532 + return -ENOENT; 1533 + default: 1534 + /* Any other error... */ 1535 + return ret; 1536 + } 1571 1537 }
+25 -2
arch/arm64/kvm/inject_fault.c
··· 106 106 { 107 107 unsigned long cpsr = *vcpu_cpsr(vcpu); 108 108 bool is_aarch32 = vcpu_mode_is_32bit(vcpu); 109 - u64 esr = 0; 109 + u64 esr = 0, fsc; 110 + int level; 111 + 112 + /* 113 + * If injecting an abort from a failed S1PTW, rewalk the S1 PTs to 114 + * find the failing level. If we can't find it, assume the error was 115 + * transient and restart without changing the state. 116 + */ 117 + if (kvm_vcpu_abt_iss1tw(vcpu)) { 118 + u64 hpfar = kvm_vcpu_get_fault_ipa(vcpu); 119 + int ret; 120 + 121 + if (hpfar == INVALID_GPA) 122 + return; 123 + 124 + ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level); 125 + if (ret) 126 + return; 127 + 128 + WARN_ON_ONCE(level < -1 || level > 3); 129 + fsc = ESR_ELx_FSC_SEA_TTW(level); 130 + } else { 131 + fsc = ESR_ELx_FSC_EXTABT; 132 + } 110 133 111 134 /* This delight is brought to you by FEAT_DoubleFault2. */ 112 135 if (effective_sctlr2_ease(vcpu)) ··· 156 133 if (!is_iabt) 157 134 esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT; 158 135 159 - esr |= ESR_ELx_FSC_EXTABT; 136 + esr |= fsc; 160 137 161 138 vcpu_write_sys_reg(vcpu, addr, exception_far_elx(vcpu)); 162 139 vcpu_write_sys_reg(vcpu, esr, exception_esr_elx(vcpu));
+1 -1
arch/arm64/kvm/nested.c
··· 349 349 wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); 350 350 /* Global limit for now, should eventually be per-VM */ 351 351 wi->max_oa_bits = min(get_kvm_ipa_limit(), 352 - ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr))); 352 + ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false)); 353 353 } 354 354 355 355 int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+42
tools/testing/selftests/kvm/arm64/external_aborts.c
··· 250 250 kvm_vm_free(vm); 251 251 } 252 252 253 + static void expect_sea_s1ptw_handler(struct ex_regs *regs) 254 + { 255 + u64 esr = read_sysreg(esr_el1); 256 + 257 + GUEST_ASSERT_EQ(regs->pc, expected_abort_pc); 258 + GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); 259 + GUEST_ASSERT_EQ((esr & ESR_ELx_FSC), ESR_ELx_FSC_SEA_TTW(3)); 260 + 261 + GUEST_DONE(); 262 + } 263 + 264 + static noinline void test_s1ptw_abort_guest(void) 265 + { 266 + extern char test_s1ptw_abort_insn; 267 + 268 + WRITE_ONCE(expected_abort_pc, (u64)&test_s1ptw_abort_insn); 269 + 270 + asm volatile("test_s1ptw_abort_insn:\n\t" 271 + "ldr x0, [%0]\n\t" 272 + : : "r" (MMIO_ADDR) : "x0", "memory"); 273 + 274 + GUEST_FAIL("Load on S1PTW abort should not retire"); 275 + } 276 + 277 + static void test_s1ptw_abort(void) 278 + { 279 + struct kvm_vcpu *vcpu; 280 + u64 *ptep, bad_pa; 281 + struct kvm_vm *vm = vm_create_with_dabt_handler(&vcpu, test_s1ptw_abort_guest, 282 + expect_sea_s1ptw_handler); 283 + 284 + ptep = virt_get_pte_hva_at_level(vm, MMIO_ADDR, 2); 285 + bad_pa = BIT(vm->pa_bits) - vm->page_size; 286 + 287 + *ptep &= ~GENMASK(47, 12); 288 + *ptep |= bad_pa; 289 + 290 + vcpu_run_expect_done(vcpu); 291 + kvm_vm_free(vm); 292 + } 293 + 253 294 static void test_serror_emulated_guest(void) 254 295 { 255 296 GUEST_ASSERT(!(read_sysreg(isr_el1) & ISR_EL1_A)); ··· 368 327 test_serror_masked(); 369 328 test_serror_emulated(); 370 329 test_mmio_ease(); 330 + test_s1ptw_abort(); 371 331 }
+1
tools/testing/selftests/kvm/include/arm64/processor.h
··· 175 175 void vm_install_sync_handler(struct kvm_vm *vm, 176 176 int vector, int ec, handler_fn handler); 177 177 178 + uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level); 178 179 uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva); 179 180 180 181 static inline void cpu_relax(void)
+12 -1
tools/testing/selftests/kvm/lib/arm64/processor.c
··· 185 185 _virt_pg_map(vm, vaddr, paddr, attr_idx); 186 186 } 187 187 188 - uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) 188 + uint64_t *virt_get_pte_hva_at_level(struct kvm_vm *vm, vm_vaddr_t gva, int level) 189 189 { 190 190 uint64_t *ptep; 191 191 ··· 195 195 ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8; 196 196 if (!ptep) 197 197 goto unmapped_gva; 198 + if (level == 0) 199 + return ptep; 198 200 199 201 switch (vm->pgtable_levels) { 200 202 case 4: 201 203 ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8; 202 204 if (!ptep) 203 205 goto unmapped_gva; 206 + if (level == 1) 207 + break; 204 208 /* fall through */ 205 209 case 3: 206 210 ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8; 207 211 if (!ptep) 208 212 goto unmapped_gva; 213 + if (level == 2) 214 + break; 209 215 /* fall through */ 210 216 case 2: 211 217 ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8; ··· 227 221 unmapped_gva: 228 222 TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva); 229 223 exit(EXIT_FAILURE); 224 + } 225 + 226 + uint64_t *virt_get_pte_hva(struct kvm_vm *vm, vm_vaddr_t gva) 227 + { 228 + return virt_get_pte_hva_at_level(vm, gva, 3); 230 229 } 231 230 232 231 vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)