Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1 fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86-urgent-2022-06-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:

- Make RESERVE_BRK() work again with older binutils. The recent
'simplification' broke that.

- Make early #VE handling increment RIP when successful.

- Make the #VE code consistent vs. the RIP adjustments and add
comments.

- Handle load_unaligned_zeropad() across page boundaries correctly in
#VE when the second page is shared.

* tag 'x86-urgent-2022-06-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/tdx: Handle load_unaligned_zeropad() page-cross to a shared page
x86/tdx: Clarify RIP adjustments in #VE handler
x86/tdx: Fix early #VE handling
x86/mm: Fix RESERVE_BRK() for older binutils

+159 -75
+136 -51
arch/x86/coco/tdx/tdx.c
··· 124 124 return BIT_ULL(gpa_width - 1); 125 125 } 126 126 127 + /* 128 + * The TDX module spec states that #VE may be injected for a limited set of 129 + * reasons: 130 + * 131 + * - Emulation of the architectural #VE injection on EPT violation; 132 + * 133 + * - As a result of guest TD execution of a disallowed instruction, 134 + * a disallowed MSR access, or CPUID virtualization; 135 + * 136 + * - A notification to the guest TD about anomalous behavior; 137 + * 138 + * The last one is opt-in and is not used by the kernel. 139 + * 140 + * The Intel Software Developer's Manual describes cases when instruction 141 + * length field can be used in section "Information for VM Exits Due to 142 + * Instruction Execution". 143 + * 144 + * For TDX, it ultimately means GET_VEINFO provides reliable instruction length 145 + * information if #VE occurred due to instruction execution, but not for EPT 146 + * violations. 147 + */ 148 + static int ve_instr_len(struct ve_info *ve) 149 + { 150 + switch (ve->exit_reason) { 151 + case EXIT_REASON_HLT: 152 + case EXIT_REASON_MSR_READ: 153 + case EXIT_REASON_MSR_WRITE: 154 + case EXIT_REASON_CPUID: 155 + case EXIT_REASON_IO_INSTRUCTION: 156 + /* It is safe to use ve->instr_len for #VE due instructions */ 157 + return ve->instr_len; 158 + case EXIT_REASON_EPT_VIOLATION: 159 + /* 160 + * For EPT violations, ve->insn_len is not defined. For those, 161 + * the kernel must decode instructions manually and should not 162 + * be using this function. 163 + */ 164 + WARN_ONCE(1, "ve->instr_len is not defined for EPT violations"); 165 + return 0; 166 + default: 167 + WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason); 168 + return ve->instr_len; 169 + } 170 + } 171 + 127 172 static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) 128 173 { 129 174 struct tdx_hypercall_args args = { ··· 192 147 return __tdx_hypercall(&args, do_sti ? 
TDX_HCALL_ISSUE_STI : 0); 193 148 } 194 149 195 - static bool handle_halt(void) 150 + static int handle_halt(struct ve_info *ve) 196 151 { 197 152 /* 198 153 * Since non safe halt is mainly used in CPU offlining ··· 203 158 const bool do_sti = false; 204 159 205 160 if (__halt(irq_disabled, do_sti)) 206 - return false; 161 + return -EIO; 207 162 208 - return true; 163 + return ve_instr_len(ve); 209 164 } 210 165 211 166 void __cpuidle tdx_safe_halt(void) ··· 225 180 WARN_ONCE(1, "HLT instruction emulation failed\n"); 226 181 } 227 182 228 - static bool read_msr(struct pt_regs *regs) 183 + static int read_msr(struct pt_regs *regs, struct ve_info *ve) 229 184 { 230 185 struct tdx_hypercall_args args = { 231 186 .r10 = TDX_HYPERCALL_STANDARD, ··· 239 194 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>". 240 195 */ 241 196 if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) 242 - return false; 197 + return -EIO; 243 198 244 199 regs->ax = lower_32_bits(args.r11); 245 200 regs->dx = upper_32_bits(args.r11); 246 - return true; 201 + return ve_instr_len(ve); 247 202 } 248 203 249 - static bool write_msr(struct pt_regs *regs) 204 + static int write_msr(struct pt_regs *regs, struct ve_info *ve) 250 205 { 251 206 struct tdx_hypercall_args args = { 252 207 .r10 = TDX_HYPERCALL_STANDARD, ··· 260 215 * can be found in TDX Guest-Host-Communication Interface 261 216 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>". 
262 217 */ 263 - return !__tdx_hypercall(&args, 0); 218 + if (__tdx_hypercall(&args, 0)) 219 + return -EIO; 220 + 221 + return ve_instr_len(ve); 264 222 } 265 223 266 - static bool handle_cpuid(struct pt_regs *regs) 224 + static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve) 267 225 { 268 226 struct tdx_hypercall_args args = { 269 227 .r10 = TDX_HYPERCALL_STANDARD, ··· 284 236 */ 285 237 if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) { 286 238 regs->ax = regs->bx = regs->cx = regs->dx = 0; 287 - return true; 239 + return ve_instr_len(ve); 288 240 } 289 241 290 242 /* ··· 293 245 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>". 294 246 */ 295 247 if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) 296 - return false; 248 + return -EIO; 297 249 298 250 /* 299 251 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of ··· 305 257 regs->cx = args.r14; 306 258 regs->dx = args.r15; 307 259 308 - return true; 260 + return ve_instr_len(ve); 309 261 } 310 262 311 263 static bool mmio_read(int size, unsigned long addr, unsigned long *val) ··· 331 283 EPT_WRITE, addr, val); 332 284 } 333 285 334 - static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve) 286 + static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) 335 287 { 288 + unsigned long *reg, val, vaddr; 336 289 char buffer[MAX_INSN_SIZE]; 337 - unsigned long *reg, val; 338 290 struct insn insn = {}; 339 291 enum mmio_type mmio; 340 292 int size, extend_size; ··· 342 294 343 295 /* Only in-kernel MMIO is supported */ 344 296 if (WARN_ON_ONCE(user_mode(regs))) 345 - return false; 297 + return -EFAULT; 346 298 347 299 if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE)) 348 - return false; 300 + return -EFAULT; 349 301 350 302 if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64)) 351 - return false; 303 + return -EINVAL; 352 304 353 305 mmio = insn_decode_mmio(&insn, &size); 354 306 if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED)) 355 - return 
false; 307 + return -EINVAL; 356 308 357 309 if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { 358 310 reg = insn_get_modrm_reg_ptr(&insn, regs); 359 311 if (!reg) 360 - return false; 312 + return -EINVAL; 361 313 } 362 314 363 - ve->instr_len = insn.length; 315 + /* 316 + * Reject EPT violation #VEs that split pages. 317 + * 318 + * MMIO accesses are supposed to be naturally aligned and therefore 319 + * never cross page boundaries. Seeing split page accesses indicates 320 + * a bug or a load_unaligned_zeropad() that stepped into an MMIO page. 321 + * 322 + * load_unaligned_zeropad() will recover using exception fixups. 323 + */ 324 + vaddr = (unsigned long)insn_get_addr_ref(&insn, regs); 325 + if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE) 326 + return -EFAULT; 364 327 365 328 /* Handle writes first */ 366 329 switch (mmio) { 367 330 case MMIO_WRITE: 368 331 memcpy(&val, reg, size); 369 - return mmio_write(size, ve->gpa, val); 332 + if (!mmio_write(size, ve->gpa, val)) 333 + return -EIO; 334 + return insn.length; 370 335 case MMIO_WRITE_IMM: 371 336 val = insn.immediate.value; 372 - return mmio_write(size, ve->gpa, val); 337 + if (!mmio_write(size, ve->gpa, val)) 338 + return -EIO; 339 + return insn.length; 373 340 case MMIO_READ: 374 341 case MMIO_READ_ZERO_EXTEND: 375 342 case MMIO_READ_SIGN_EXTEND: ··· 397 334 * decoded or handled properly. It was likely not using io.h 398 335 * helpers or accessed MMIO accidentally. 
399 336 */ 400 - return false; 337 + return -EINVAL; 401 338 default: 402 339 WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?"); 403 - return false; 340 + return -EINVAL; 404 341 } 405 342 406 343 /* Handle reads */ 407 344 if (!mmio_read(size, ve->gpa, &val)) 408 - return false; 345 + return -EIO; 409 346 410 347 switch (mmio) { 411 348 case MMIO_READ: ··· 427 364 default: 428 365 /* All other cases has to be covered with the first switch() */ 429 366 WARN_ON_ONCE(1); 430 - return false; 367 + return -EINVAL; 431 368 } 432 369 433 370 if (extend_size) 434 371 memset(reg, extend_val, extend_size); 435 372 memcpy(reg, &val, size); 436 - return true; 373 + return insn.length; 437 374 } 438 375 439 376 static bool handle_in(struct pt_regs *regs, int size, int port) ··· 484 421 * 485 422 * Return True on success or False on failure. 486 423 */ 487 - static bool handle_io(struct pt_regs *regs, u32 exit_qual) 424 + static int handle_io(struct pt_regs *regs, struct ve_info *ve) 488 425 { 426 + u32 exit_qual = ve->exit_qual; 489 427 int size, port; 490 - bool in; 428 + bool in, ret; 491 429 492 430 if (VE_IS_IO_STRING(exit_qual)) 493 - return false; 431 + return -EIO; 494 432 495 433 in = VE_IS_IO_IN(exit_qual); 496 434 size = VE_GET_IO_SIZE(exit_qual); ··· 499 435 500 436 501 437 if (in) 502 - return handle_in(regs, size, port); 438 + ret = handle_in(regs, size, port); 503 439 else 504 - return handle_out(regs, size, port); 440 + ret = handle_out(regs, size, port); 441 + if (!ret) 442 + return -EIO; 443 + 444 + return ve_instr_len(ve); 505 445 } 506 446 507 447 /* ··· 515 447 __init bool tdx_early_handle_ve(struct pt_regs *regs) 516 448 { 517 449 struct ve_info ve; 450 + int insn_len; 518 451 519 452 tdx_get_ve_info(&ve); 520 453 521 454 if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) 522 455 return false; 523 456 524 - return handle_io(regs, ve.exit_qual); 457 + insn_len = handle_io(regs, &ve); 458 + if (insn_len < 0) 459 + return false; 460 + 461 + regs->ip += 
insn_len; 462 + return true; 525 463 } 526 464 527 465 void tdx_get_ve_info(struct ve_info *ve) ··· 560 486 ve->instr_info = upper_32_bits(out.r10); 561 487 } 562 488 563 - /* Handle the user initiated #VE */ 564 - static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve) 489 + /* 490 + * Handle the user initiated #VE. 491 + * 492 + * On success, returns the number of bytes RIP should be incremented (>=0) 493 + * or -errno on error. 494 + */ 495 + static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve) 565 496 { 566 497 switch (ve->exit_reason) { 567 498 case EXIT_REASON_CPUID: 568 - return handle_cpuid(regs); 499 + return handle_cpuid(regs, ve); 569 500 default: 570 501 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); 571 - return false; 502 + return -EIO; 572 503 } 573 504 } 574 505 575 - /* Handle the kernel #VE */ 576 - static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) 506 + /* 507 + * Handle the kernel #VE. 508 + * 509 + * On success, returns the number of bytes RIP should be incremented (>=0) 510 + * or -errno on error. 
511 + */ 512 + static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) 577 513 { 578 514 switch (ve->exit_reason) { 579 515 case EXIT_REASON_HLT: 580 - return handle_halt(); 516 + return handle_halt(ve); 581 517 case EXIT_REASON_MSR_READ: 582 - return read_msr(regs); 518 + return read_msr(regs, ve); 583 519 case EXIT_REASON_MSR_WRITE: 584 - return write_msr(regs); 520 + return write_msr(regs, ve); 585 521 case EXIT_REASON_CPUID: 586 - return handle_cpuid(regs); 522 + return handle_cpuid(regs, ve); 587 523 case EXIT_REASON_EPT_VIOLATION: 588 524 return handle_mmio(regs, ve); 589 525 case EXIT_REASON_IO_INSTRUCTION: 590 - return handle_io(regs, ve->exit_qual); 526 + return handle_io(regs, ve); 591 527 default: 592 528 pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); 593 - return false; 529 + return -EIO; 594 530 } 595 531 } 596 532 597 533 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) 598 534 { 599 - bool ret; 535 + int insn_len; 600 536 601 537 if (user_mode(regs)) 602 - ret = virt_exception_user(regs, ve); 538 + insn_len = virt_exception_user(regs, ve); 603 539 else 604 - ret = virt_exception_kernel(regs, ve); 540 + insn_len = virt_exception_kernel(regs, ve); 541 + if (insn_len < 0) 542 + return false; 605 543 606 544 /* After successful #VE handling, move the IP */ 607 - if (ret) 608 - regs->ip += ve->instr_len; 545 + regs->ip += insn_len; 609 546 610 - return ret; 547 + return true; 611 548 } 612 549 613 550 static bool tdx_tlb_flush_required(bool private)
+21 -17
arch/x86/include/asm/setup.h
··· 108 108 void *extend_brk(size_t size, size_t align); 109 109 110 110 /* 111 - * Reserve space in the brk section. The name must be unique within the file, 112 - * and somewhat descriptive. The size is in bytes. 111 + * Reserve space in the .brk section, which is a block of memory from which the 112 + * caller is allowed to allocate very early (before even memblock is available) 113 + * by calling extend_brk(). All allocated memory will be eventually converted 114 + * to memblock. Any leftover unallocated memory will be freed. 113 115 * 114 - * The allocation is done using inline asm (rather than using a section 115 - * attribute on a normal variable) in order to allow the use of @nobits, so 116 - * that it doesn't take up any space in the vmlinux file. 116 + * The size is in bytes. 117 117 */ 118 - #define RESERVE_BRK(name, size) \ 119 - asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \ 120 - ".brk." #name ":\n\t" \ 121 - ".skip " __stringify(size) "\n\t" \ 122 - ".size .brk." #name ", " __stringify(size) "\n\t" \ 123 - ".popsection\n\t") 118 + #define RESERVE_BRK(name, size) \ 119 + __section(".bss..brk") __aligned(1) __used \ 120 + static char __brk_##name[size] 124 121 125 122 extern void probe_roms(void); 126 123 #ifdef __i386__ ··· 130 133 131 134 #endif /* __i386__ */ 132 135 #endif /* _SETUP */ 133 - #else 134 - #define RESERVE_BRK(name,sz) \ 135 - .pushsection .brk_reservation,"aw",@nobits; \ 136 - .brk.name: \ 137 - 1: .skip sz; \ 138 - .size .brk.name,.-1b; \ 136 + 137 + #else /* __ASSEMBLY */ 138 + 139 + .macro __RESERVE_BRK name, size 140 + .pushsection .bss..brk, "aw" 141 + SYM_DATA_START(__brk_\name) 142 + .skip \size 143 + SYM_DATA_END(__brk_\name) 139 144 .popsection 145 + .endm 146 + 147 + #define RESERVE_BRK(name, size) __RESERVE_BRK name, size 148 + 140 149 #endif /* __ASSEMBLY__ */ 150 + 141 151 #endif /* _ASM_X86_SETUP_H */
-5
arch/x86/kernel/setup.c
··· 67 67 #endif 68 68 69 69 70 - /* 71 - * Range of the BSS area. The size of the BSS area is determined 72 - * at link time, with RESERVE_BRK() facility reserving additional 73 - * chunks. 74 - */ 75 70 unsigned long _brk_start = (unsigned long)__brk_base; 76 71 unsigned long _brk_end = (unsigned long)__brk_base; 77 72
+2 -2
arch/x86/kernel/vmlinux.lds.S
··· 385 385 __end_of_kernel_reserve = .; 386 386 387 387 . = ALIGN(PAGE_SIZE); 388 - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { 388 + .brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) { 389 389 __brk_base = .; 390 390 . += 64 * 1024; /* 64k alignment slop space */ 391 - *(.brk_reservation) /* areas brk users have reserved */ 391 + *(.bss..brk) /* areas brk users have reserved */ 392 392 __brk_limit = .; 393 393 } 394 394