Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

x86/hyperv: Add kexec/kdump support on Azure CVMs

Azure CVM instance types featuring a paravisor hang upon kdump. The
investigation shows that makedumpfile causes a hang when it steps on a page
which was previously shared with the host
(HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no
knowledge of these 'special' regions (which are VMBus connection pages,
GPADL buffers, ...). There are several ways to approach the issue:
- Convey the knowledge about these regions to the new kernel somehow.
- Unshare these regions before accessing them in the new kernel (it is unclear
if there's a way to query the status for a given GPA range).
- Unshare these regions before jumping to the new kernel (which this patch
implements).

To make the procedure as robust as possible, store PFN ranges of shared
regions in a linked list instead of storing GVAs and re-using
hv_vtom_set_host_visibility(). This also makes it possible to avoid memory allocation
on the kdump/kexec path.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Tianyu Lan <tiala@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>

authored by

Vitaly Kuznetsov and committed by
Wei Liu
7ad8c34f 2d0ddbb6

+210 -1
+210 -1
arch/x86/hyperv/ivm.c
··· 463 463 } 464 464 465 465 /* 466 + * Keep track of the PFN regions which were shared with the host. The access 467 + * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()). 468 + */ 469 + struct hv_enc_pfn_region { 470 + struct list_head list; 471 + u64 pfn; 472 + int count; 473 + }; 474 + 475 + static LIST_HEAD(hv_list_enc); 476 + static DEFINE_RAW_SPINLOCK(hv_list_enc_lock); 477 + 478 + static int hv_list_enc_add(const u64 *pfn_list, int count) 479 + { 480 + struct hv_enc_pfn_region *ent; 481 + unsigned long flags; 482 + u64 pfn; 483 + int i; 484 + 485 + for (i = 0; i < count; i++) { 486 + pfn = pfn_list[i]; 487 + 488 + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); 489 + /* Check if the PFN already exists in some region first */ 490 + list_for_each_entry(ent, &hv_list_enc, list) { 491 + if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn)) 492 + /* Nothing to do - pfn is already in the list */ 493 + goto unlock_done; 494 + } 495 + 496 + /* 497 + * Check if the PFN is adjacent to an existing region. Growing 498 + * a region can make it adjacent to another one but merging is 499 + * not (yet) implemented for simplicity. A PFN cannot be added 500 + * to two regions to keep the logic in hv_list_enc_remove() 501 + * correct. 
502 + */ 503 + list_for_each_entry(ent, &hv_list_enc, list) { 504 + if (ent->pfn + ent->count == pfn) { 505 + /* Grow existing region up */ 506 + ent->count++; 507 + goto unlock_done; 508 + } else if (pfn + 1 == ent->pfn) { 509 + /* Grow existing region down */ 510 + ent->pfn--; 511 + ent->count++; 512 + goto unlock_done; 513 + } 514 + } 515 + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); 516 + 517 + /* No adjacent region found -- create a new one */ 518 + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); 519 + if (!ent) 520 + return -ENOMEM; 521 + 522 + ent->pfn = pfn; 523 + ent->count = 1; 524 + 525 + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); 526 + list_add(&ent->list, &hv_list_enc); 527 + 528 + unlock_done: 529 + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); 530 + } 531 + 532 + return 0; 533 + } 534 + 535 + static int hv_list_enc_remove(const u64 *pfn_list, int count) 536 + { 537 + struct hv_enc_pfn_region *ent, *t; 538 + struct hv_enc_pfn_region new_region; 539 + unsigned long flags; 540 + u64 pfn; 541 + int i; 542 + 543 + for (i = 0; i < count; i++) { 544 + pfn = pfn_list[i]; 545 + 546 + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); 547 + list_for_each_entry_safe(ent, t, &hv_list_enc, list) { 548 + if (pfn == ent->pfn + ent->count - 1) { 549 + /* Removing tail pfn */ 550 + ent->count--; 551 + if (!ent->count) { 552 + list_del(&ent->list); 553 + kfree(ent); 554 + } 555 + goto unlock_done; 556 + } else if (pfn == ent->pfn) { 557 + /* Removing head pfn */ 558 + ent->count--; 559 + ent->pfn++; 560 + if (!ent->count) { 561 + list_del(&ent->list); 562 + kfree(ent); 563 + } 564 + goto unlock_done; 565 + } else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) { 566 + /* 567 + * Removing a pfn in the middle. Cut off the tail 568 + * of the existing region and create a template for 569 + * the new one. 
570 + */ 571 + new_region.pfn = pfn + 1; 572 + new_region.count = ent->count - (pfn - ent->pfn + 1); 573 + ent->count = pfn - ent->pfn; 574 + goto unlock_split; 575 + } 576 + 577 + } 578 + unlock_done: 579 + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); 580 + continue; 581 + 582 + unlock_split: 583 + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); 584 + 585 + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); 586 + if (!ent) 587 + return -ENOMEM; 588 + 589 + ent->pfn = new_region.pfn; 590 + ent->count = new_region.count; 591 + 592 + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); 593 + list_add(&ent->list, &hv_list_enc); 594 + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); 595 + } 596 + 597 + return 0; 598 + } 599 + 600 + /* Stop new private<->shared conversions */ 601 + static void hv_vtom_kexec_begin(void) 602 + { 603 + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) 604 + return; 605 + 606 + /* 607 + * Crash kernel reaches here with interrupts disabled: can't wait for 608 + * conversions to finish. 609 + * 610 + * If race happened, just report and proceed. 
611 + */ 612 + if (!set_memory_enc_stop_conversion()) 613 + pr_warn("Failed to stop shared<->private conversions\n"); 614 + } 615 + 616 + static void hv_vtom_kexec_finish(void) 617 + { 618 + struct hv_gpa_range_for_visibility *input; 619 + struct hv_enc_pfn_region *ent; 620 + unsigned long flags; 621 + u64 hv_status; 622 + int cur, i; 623 + 624 + local_irq_save(flags); 625 + input = *this_cpu_ptr(hyperv_pcpu_input_arg); 626 + 627 + if (unlikely(!input)) 628 + goto out; 629 + 630 + list_for_each_entry(ent, &hv_list_enc, list) { 631 + for (i = 0, cur = 0; i < ent->count; i++) { 632 + input->gpa_page_list[cur] = ent->pfn + i; 633 + cur++; 634 + 635 + if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) { 636 + input->partition_id = HV_PARTITION_ID_SELF; 637 + input->host_visibility = VMBUS_PAGE_NOT_VISIBLE; 638 + input->reserved0 = 0; 639 + input->reserved1 = 0; 640 + hv_status = hv_do_rep_hypercall( 641 + HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, 642 + cur, 0, input, NULL); 643 + WARN_ON_ONCE(!hv_result_success(hv_status)); 644 + cur = 0; 645 + } 646 + } 647 + 648 + } 649 + 650 + out: 651 + local_irq_restore(flags); 652 + } 653 + 654 + /* 466 655 * hv_mark_gpa_visibility - Set pages visible to host via hvcall. 
467 656 * 468 657 * In Isolation VM, all guest memory is encrypted from host and guest ··· 664 475 struct hv_gpa_range_for_visibility *input; 665 476 u64 hv_status; 666 477 unsigned long flags; 478 + int ret; 667 479 668 480 /* no-op if partition isolation is not enabled */ 669 481 if (!hv_is_isolation_supported()) ··· 675 485 HV_MAX_MODIFY_GPA_REP_COUNT); 676 486 return -EINVAL; 677 487 } 488 + 489 + if (visibility == VMBUS_PAGE_NOT_VISIBLE) 490 + ret = hv_list_enc_remove(pfn, count); 491 + else 492 + ret = hv_list_enc_add(pfn, count); 493 + if (ret) 494 + return ret; 678 495 679 496 local_irq_save(flags); 680 497 input = *this_cpu_ptr(hyperv_pcpu_input_arg); ··· 703 506 704 507 if (hv_result_success(hv_status)) 705 508 return 0; 509 + 510 + if (visibility == VMBUS_PAGE_NOT_VISIBLE) 511 + ret = hv_list_enc_add(pfn, count); 706 512 else 707 - return -EFAULT; 513 + ret = hv_list_enc_remove(pfn, count); 514 + /* 515 + * There's no good way to recover from -ENOMEM here, the accounting is 516 + * wrong either way. 517 + */ 518 + WARN_ON_ONCE(ret); 519 + 520 + return -EFAULT; 708 521 } 709 522 710 523 /* ··· 876 669 x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; 877 670 x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present; 878 671 x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; 672 + x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin; 673 + x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish; 879 674 880 675 /* Set WB as the default cache mode. */ 881 676 guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);