Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

+11 -1

Documentation/virtual/kvm/api.txt

··· 4510 4510 Architectures: s390 4511 4511 Parameters: none 4512 4512 Returns: 0 on success, -EINVAL if hpage module parameter was not set 4513 - or cmma is enabled 4513 + or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL 4514 + flag set 4514 4515 4515 4516 With this capability the KVM support for memory backing with 1m pages 4516 4517 through hugetlbfs can be enabled for a VM. After the capability is ··· 4521 4520 4522 4521 While it is generally possible to create a huge page backed VM without 4523 4522 this capability, the VM will not be able to run. 4523 + 4524 + 7.14 KVM_CAP_MSR_PLATFORM_INFO 4525 + 4526 + Architectures: x86 4527 + Parameters: args[0] whether feature should be enabled or not 4528 + 4529 + With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, 4530 + a #GP would be raised when the guest tries to access it. Currently, this 4531 + capability does not enable write permissions of this MSR for the guest. 4524 4532 4525 4533 8. Other capabilities. 4526 4534 ----------------------

-1

arch/powerpc/include/asm/book3s/64/pgtable.h

··· 1051 1051 return hash__vmemmap_remove_mapping(start, page_size); 1052 1052 } 1053 1053 #endif 1054 - struct page *realmode_pfn_to_page(unsigned long pfn); 1055 1054 1056 1055 static inline pte_t pmd_pte(pmd_t pmd) 1057 1056 {

-2

arch/powerpc/include/asm/iommu.h

··· 220 220 extern int __init tce_iommu_bus_notifier_init(void); 221 221 extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, 222 222 unsigned long *hpa, enum dma_data_direction *direction); 223 - extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, 224 - unsigned long *hpa, enum dma_data_direction *direction); 225 223 #else 226 224 static inline void iommu_register_group(struct iommu_table_group *table_group, 227 225 int pci_domain_number,

+1

arch/powerpc/include/asm/mmu_context.h

··· 38 38 unsigned long ua, unsigned int pageshift, unsigned long *hpa); 39 39 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, 40 40 unsigned long ua, unsigned int pageshift, unsigned long *hpa); 41 + extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua); 41 42 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); 42 43 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); 43 44 #endif

-25

arch/powerpc/kernel/iommu.c

··· 1013 1013 } 1014 1014 EXPORT_SYMBOL_GPL(iommu_tce_xchg); 1015 1015 1016 - #ifdef CONFIG_PPC_BOOK3S_64 1017 - long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, 1018 - unsigned long *hpa, enum dma_data_direction *direction) 1019 - { 1020 - long ret; 1021 - 1022 - ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); 1023 - 1024 - if (!ret && ((*direction == DMA_FROM_DEVICE) || 1025 - (*direction == DMA_BIDIRECTIONAL))) { 1026 - struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT); 1027 - 1028 - if (likely(pg)) { 1029 - SetPageDirty(pg); 1030 - } else { 1031 - tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); 1032 - ret = -EFAULT; 1033 - } 1034 - } 1035 - 1036 - return ret; 1037 - } 1038 - EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm); 1039 - #endif 1040 - 1041 1016 int iommu_take_ownership(struct iommu_table *tbl) 1042 1017 { 1043 1018 unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;

+37 -54

arch/powerpc/kvm/book3s_64_mmu_radix.c

··· 525 525 unsigned long ea, unsigned long dsisr) 526 526 { 527 527 struct kvm *kvm = vcpu->kvm; 528 - unsigned long mmu_seq, pte_size; 529 - unsigned long gpa, gfn, hva, pfn; 528 + unsigned long mmu_seq; 529 + unsigned long gpa, gfn, hva; 530 530 struct kvm_memory_slot *memslot; 531 531 struct page *page = NULL; 532 532 long ret; ··· 623 623 */ 624 624 hva = gfn_to_hva_memslot(memslot, gfn); 625 625 if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { 626 - pfn = page_to_pfn(page); 627 626 upgrade_write = true; 628 627 } else { 628 + unsigned long pfn; 629 + 629 630 /* Call KVM generic code to do the slow-path check */ 630 631 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 631 632 writing, upgrade_p); ··· 640 639 } 641 640 } 642 641 643 - /* See if we can insert a 1GB or 2MB large PTE here */ 644 - level = 0; 645 - if (page && PageCompound(page)) { 646 - pte_size = PAGE_SIZE << compound_order(compound_head(page)); 647 - if (pte_size >= PUD_SIZE && 648 - (gpa & (PUD_SIZE - PAGE_SIZE)) == 649 - (hva & (PUD_SIZE - PAGE_SIZE))) { 650 - level = 2; 651 - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); 652 - } else if (pte_size >= PMD_SIZE && 653 - (gpa & (PMD_SIZE - PAGE_SIZE)) == 654 - (hva & (PMD_SIZE - PAGE_SIZE))) { 655 - level = 1; 656 - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); 642 + /* 643 + * Read the PTE from the process' radix tree and use that 644 + * so we get the shift and attribute bits. 
645 + */ 646 + local_irq_disable(); 647 + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); 648 + pte = *ptep; 649 + local_irq_enable(); 650 + 651 + /* Get pte level from shift/size */ 652 + if (shift == PUD_SHIFT && 653 + (gpa & (PUD_SIZE - PAGE_SIZE)) == 654 + (hva & (PUD_SIZE - PAGE_SIZE))) { 655 + level = 2; 656 + } else if (shift == PMD_SHIFT && 657 + (gpa & (PMD_SIZE - PAGE_SIZE)) == 658 + (hva & (PMD_SIZE - PAGE_SIZE))) { 659 + level = 1; 660 + } else { 661 + level = 0; 662 + if (shift > PAGE_SHIFT) { 663 + /* 664 + * If the pte maps more than one page, bring over 665 + * bits from the virtual address to get the real 666 + * address of the specific single page we want. 667 + */ 668 + unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; 669 + pte = __pte(pte_val(pte) | (hva & rpnmask)); 657 670 } 658 671 } 659 672 660 - /* 661 - * Compute the PTE value that we need to insert. 662 - */ 663 - if (page) { 664 - pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | 665 - _PAGE_ACCESSED; 666 - if (writing || upgrade_write) 667 - pgflags |= _PAGE_WRITE | _PAGE_DIRTY; 668 - pte = pfn_pte(pfn, __pgprot(pgflags)); 673 + pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); 674 + if (writing || upgrade_write) { 675 + if (pte_val(pte) & _PAGE_WRITE) 676 + pte = __pte(pte_val(pte) | _PAGE_DIRTY); 669 677 } else { 670 - /* 671 - * Read the PTE from the process' radix tree and use that 672 - * so we get the attribute bits. 
673 - */ 674 - local_irq_disable(); 675 - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); 676 - pte = *ptep; 677 - local_irq_enable(); 678 - if (shift == PUD_SHIFT && 679 - (gpa & (PUD_SIZE - PAGE_SIZE)) == 680 - (hva & (PUD_SIZE - PAGE_SIZE))) { 681 - level = 2; 682 - } else if (shift == PMD_SHIFT && 683 - (gpa & (PMD_SIZE - PAGE_SIZE)) == 684 - (hva & (PMD_SIZE - PAGE_SIZE))) { 685 - level = 1; 686 - } else if (shift && shift != PAGE_SHIFT) { 687 - /* Adjust PFN */ 688 - unsigned long mask = (1ul << shift) - PAGE_SIZE; 689 - pte = __pte(pte_val(pte) | (hva & mask)); 690 - } 691 - pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); 692 - if (writing || upgrade_write) { 693 - if (pte_val(pte) & _PAGE_WRITE) 694 - pte = __pte(pte_val(pte) | _PAGE_DIRTY); 695 - } else { 696 - pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); 697 - } 678 + pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); 698 679 } 699 680 700 681 /* Allocate space in the tree and write the PTE */

+31 -8

arch/powerpc/kvm/book3s_64_vio_hv.c

··· 187 187 EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); 188 188 189 189 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 190 - static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) 190 + static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, 191 + unsigned long entry, unsigned long *hpa, 192 + enum dma_data_direction *direction) 193 + { 194 + long ret; 195 + 196 + ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); 197 + 198 + if (!ret && ((*direction == DMA_FROM_DEVICE) || 199 + (*direction == DMA_BIDIRECTIONAL))) { 200 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); 201 + /* 202 + * kvmppc_rm_tce_iommu_do_map() updates the UA cache after 203 + * calling this so we still get here a valid UA. 204 + */ 205 + if (pua && *pua) 206 + mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua)); 207 + } 208 + 209 + return ret; 210 + } 211 + 212 + static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, 213 + unsigned long entry) 191 214 { 192 215 unsigned long hpa = 0; 193 216 enum dma_data_direction dir = DMA_NONE; 194 217 195 - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); 218 + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 196 219 } 197 220 198 221 static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, ··· 247 224 unsigned long hpa = 0; 248 225 long ret; 249 226 250 - if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) 227 + if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir)) 251 228 /* 252 229 * real mode xchg can fail if struct page crosses 253 230 * a page boundary ··· 259 236 260 237 ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); 261 238 if (ret) 262 - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); 239 + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 263 240 264 241 return ret; 265 242 } ··· 305 282 if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) 306 283 return H_CLOSED; 307 284 308 - ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); 285 + ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 
309 286 if (ret) { 310 287 mm_iommu_mapped_dec(mem); 311 288 /* ··· 394 371 return ret; 395 372 396 373 WARN_ON_ONCE_RM(1); 397 - kvmppc_rm_clear_tce(stit->tbl, entry); 374 + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 398 375 } 399 376 400 377 kvmppc_tce_put(stt, entry, tce); ··· 543 520 goto unlock_exit; 544 521 545 522 WARN_ON_ONCE_RM(1); 546 - kvmppc_rm_clear_tce(stit->tbl, entry); 523 + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 547 524 } 548 525 549 526 kvmppc_tce_put(stt, entry + i, tce); ··· 594 571 return ret; 595 572 596 573 WARN_ON_ONCE_RM(1); 597 - kvmppc_rm_clear_tce(stit->tbl, entry); 574 + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 598 575 } 599 576 } 600 577

-49

arch/powerpc/mm/init_64.c

··· 308 308 { 309 309 } 310 310 311 - /* 312 - * We do not have access to the sparsemem vmemmap, so we fallback to 313 - * walking the list of sparsemem blocks which we already maintain for 314 - * the sake of crashdump. In the long run, we might want to maintain 315 - * a tree if performance of that linear walk becomes a problem. 316 - * 317 - * realmode_pfn_to_page functions can fail due to: 318 - * 1) As real sparsemem blocks do not lay in RAM continously (they 319 - * are in virtual address space which is not available in the real mode), 320 - * the requested page struct can be split between blocks so get_page/put_page 321 - * may fail. 322 - * 2) When huge pages are used, the get_page/put_page API will fail 323 - * in real mode as the linked addresses in the page struct are virtual 324 - * too. 325 - */ 326 - struct page *realmode_pfn_to_page(unsigned long pfn) 327 - { 328 - struct vmemmap_backing *vmem_back; 329 - struct page *page; 330 - unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; 331 - unsigned long pg_va = (unsigned long) pfn_to_page(pfn); 332 - 333 - for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { 334 - if (pg_va < vmem_back->virt_addr) 335 - continue; 336 - 337 - /* After vmemmap_list entry free is possible, need check all */ 338 - if ((pg_va + sizeof(struct page)) <= 339 - (vmem_back->virt_addr + page_size)) { 340 - page = (struct page *) (vmem_back->phys + pg_va - 341 - vmem_back->virt_addr); 342 - return page; 343 - } 344 - } 345 - 346 - /* Probably that page struct is split between real pages */ 347 - return NULL; 348 - } 349 - EXPORT_SYMBOL_GPL(realmode_pfn_to_page); 350 - 351 - #else 352 - 353 - struct page *realmode_pfn_to_page(unsigned long pfn) 354 - { 355 - struct page *page = pfn_to_page(pfn); 356 - return page; 357 - } 358 - EXPORT_SYMBOL_GPL(realmode_pfn_to_page); 359 - 360 311 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 361 312 362 313 #ifdef CONFIG_PPC_BOOK3S_64

+30 -4

arch/powerpc/mm/mmu_context_iommu.c

··· 18 18 #include <linux/migrate.h> 19 19 #include <linux/hugetlb.h> 20 20 #include <linux/swap.h> 21 + #include <linux/sizes.h> 21 22 #include <asm/mmu_context.h> 22 23 #include <asm/pte-walk.h> 23 24 24 25 static DEFINE_MUTEX(mem_list_mutex); 26 + 27 + #define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 28 + #define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) 25 29 26 30 struct mm_iommu_table_group_mem_t { 27 31 struct list_head next; ··· 267 263 if (!page) 268 264 continue; 269 265 266 + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) 267 + SetPageDirty(page); 268 + 270 269 put_page(page); 271 270 mem->hpas[i] = 0; 272 271 } ··· 367 360 368 361 return ret; 369 362 } 370 - EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); 371 363 372 364 struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, 373 365 unsigned long ua, unsigned long entries) ··· 396 390 if (pageshift > mem->pageshift) 397 391 return -EFAULT; 398 392 399 - *hpa = *va | (ua & ~PAGE_MASK); 393 + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); 400 394 401 395 return 0; 402 396 } ··· 419 413 if (!pa) 420 414 return -EFAULT; 421 415 422 - *hpa = *pa | (ua & ~PAGE_MASK); 416 + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); 423 417 424 418 return 0; 425 419 } 426 - EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); 420 + 421 + extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) 422 + { 423 + struct mm_iommu_table_group_mem_t *mem; 424 + long entry; 425 + void *va; 426 + unsigned long *pa; 427 + 428 + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); 429 + if (!mem) 430 + return; 431 + 432 + entry = (ua - mem->ua) >> PAGE_SHIFT; 433 + va = &mem->hpas[entry]; 434 + 435 + pa = (void *) vmalloc_to_phys(va); 436 + if (!pa) 437 + return; 438 + 439 + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; 440 + } 427 441 428 442 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) 429 443 {

+2 -2

arch/s390/kvm/kvm-s390.c

··· 481 481 break; 482 482 case KVM_CAP_S390_HPAGE_1M: 483 483 r = 0; 484 - if (hpage) 484 + if (hpage && !kvm_is_ucontrol(kvm)) 485 485 r = 1; 486 486 break; 487 487 case KVM_CAP_S390_MEM_OP: ··· 691 691 mutex_lock(&kvm->lock); 692 692 if (kvm->created_vcpus) 693 693 r = -EBUSY; 694 - else if (!hpage || kvm->arch.use_cmma) 694 + else if (!hpage || kvm->arch.use_cmma || kvm_is_ucontrol(kvm)) 695 695 r = -EINVAL; 696 696 else { 697 697 r = 0;

+3 -1

arch/s390/mm/gmap.c

··· 708 708 vmaddr |= gaddr & ~PMD_MASK; 709 709 /* Find vma in the parent mm */ 710 710 vma = find_vma(gmap->mm, vmaddr); 711 + if (!vma) 712 + continue; 711 713 /* 712 714 * We do not discard pages that are backed by 713 715 * hugetlbfs, so we don't have to refault them. 714 716 */ 715 - if (vma && is_vm_hugetlb_page(vma)) 717 + if (is_vm_hugetlb_page(vma)) 716 718 continue; 717 719 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); 718 720 zap_page_range(vma, vmaddr, size);

+4 -4

arch/x86/hyperv/hv_apic.c

··· 95 95 */ 96 96 static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) 97 97 { 98 - struct ipi_arg_ex **arg; 99 - struct ipi_arg_ex *ipi_arg; 98 + struct hv_send_ipi_ex **arg; 99 + struct hv_send_ipi_ex *ipi_arg; 100 100 unsigned long flags; 101 101 int nr_bank = 0; 102 102 int ret = 1; ··· 105 105 return false; 106 106 107 107 local_irq_save(flags); 108 - arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); 108 + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); 109 109 110 110 ipi_arg = *arg; 111 111 if (unlikely(!ipi_arg)) ··· 135 135 static bool __send_ipi_mask(const struct cpumask *mask, int vector) 136 136 { 137 137 int cur_cpu, vcpu; 138 - struct ipi_arg_non_ex ipi_arg; 138 + struct hv_send_ipi ipi_arg; 139 139 int ret = 1; 140 140 141 141 trace_hyperv_send_ipi_mask(mask, vector);

+9 -7

arch/x86/include/asm/hyperv-tlfs.h

··· 726 726 #define HV_STIMER_AUTOENABLE (1ULL << 3) 727 727 #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) 728 728 729 - struct ipi_arg_non_ex { 730 - u32 vector; 731 - u32 reserved; 732 - u64 cpu_mask; 733 - }; 734 - 735 729 struct hv_vpset { 736 730 u64 format; 737 731 u64 valid_bank_mask; 738 732 u64 bank_contents[]; 739 733 }; 740 734 741 - struct ipi_arg_ex { 735 + /* HvCallSendSyntheticClusterIpi hypercall */ 736 + struct hv_send_ipi { 737 + u32 vector; 738 + u32 reserved; 739 + u64 cpu_mask; 740 + }; 741 + 742 + /* HvCallSendSyntheticClusterIpiEx hypercall */ 743 + struct hv_send_ipi_ex { 742 744 u32 vector; 743 745 u32 reserved; 744 746 struct hv_vpset vp_set;

+5

arch/x86/include/asm/kvm_host.h

··· 869 869 870 870 bool x2apic_format; 871 871 bool x2apic_broadcast_quirk_disabled; 872 + 873 + bool guest_can_read_msr_platform_info; 872 874 }; 873 875 874 876 struct kvm_vm_stat { ··· 1024 1022 void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); 1025 1023 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 1026 1024 void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); 1025 + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); 1027 1026 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); 1028 1027 void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); 1029 1028 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); ··· 1058 1055 bool (*umip_emulated)(void); 1059 1056 1060 1057 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); 1058 + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); 1061 1059 1062 1060 void (*sched_in)(struct kvm_vcpu *kvm, int cpu); 1063 1061 ··· 1486 1482 1487 1483 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); 1488 1484 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); 1485 + void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); 1489 1486 1490 1487 int kvm_is_in_guest(void); 1491 1488

+1

arch/x86/include/uapi/asm/kvm.h

··· 377 377 378 378 #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) 379 379 #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) 380 + #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) 380 381 381 382 #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 382 383 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002

+19 -3

arch/x86/kvm/lapic.c

··· 1344 1344 1345 1345 static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) 1346 1346 { 1347 - return kvm_apic_hw_enabled(apic) && 1348 - addr >= apic->base_address && 1349 - addr < apic->base_address + LAPIC_MMIO_LENGTH; 1347 + return addr >= apic->base_address && 1348 + addr < apic->base_address + LAPIC_MMIO_LENGTH; 1350 1349 } 1351 1350 1352 1351 static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, ··· 1356 1357 1357 1358 if (!apic_mmio_in_range(apic, address)) 1358 1359 return -EOPNOTSUPP; 1360 + 1361 + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { 1362 + if (!kvm_check_has_quirk(vcpu->kvm, 1363 + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) 1364 + return -EOPNOTSUPP; 1365 + 1366 + memset(data, 0xff, len); 1367 + return 0; 1368 + } 1359 1369 1360 1370 kvm_lapic_reg_read(apic, offset, len, data); 1361 1371 ··· 1924 1916 1925 1917 if (!apic_mmio_in_range(apic, address)) 1926 1918 return -EOPNOTSUPP; 1919 + 1920 + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { 1921 + if (!kvm_check_has_quirk(vcpu->kvm, 1922 + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) 1923 + return -EOPNOTSUPP; 1924 + 1925 + return 0; 1926 + } 1927 1927 1928 1928 /* 1929 1929 * APIC register must be aligned on 128-bits boundary.

+7 -2

arch/x86/kvm/mmu.c

··· 899 899 { 900 900 /* 901 901 * Make sure the write to vcpu->mode is not reordered in front of 902 - * reads to sptes. If it does, kvm_commit_zap_page() can see us 902 + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us 903 903 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. 904 904 */ 905 905 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); ··· 5417 5417 { 5418 5418 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 5419 5419 5420 - kvm_init_mmu(vcpu, true); 5420 + /* 5421 + * kvm_mmu_setup() is called only on vCPU initialization. 5422 + * Therefore, no need to reset mmu roots as they are not yet 5423 + * initialized. 5424 + */ 5425 + kvm_init_mmu(vcpu, false); 5421 5426 } 5422 5427 5423 5428 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,

+4 -3

arch/x86/kvm/svm.c

··· 1226 1226 min_sev_asid = cpuid_edx(0x8000001F); 1227 1227 1228 1228 /* Initialize SEV ASID bitmap */ 1229 - sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), 1230 - sizeof(unsigned long), GFP_KERNEL); 1229 + sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); 1231 1230 if (!sev_asid_bitmap) 1232 1231 return 1; 1233 1232 ··· 1404 1405 int cpu; 1405 1406 1406 1407 if (svm_sev_enabled()) 1407 - kfree(sev_asid_bitmap); 1408 + bitmap_free(sev_asid_bitmap); 1408 1409 1409 1410 for_each_possible_cpu(cpu) 1410 1411 svm_cpu_uninit(cpu); ··· 7147 7148 7148 7149 .check_intercept = svm_check_intercept, 7149 7150 .handle_external_intr = svm_handle_external_intr, 7151 + 7152 + .request_immediate_exit = __kvm_request_immediate_exit, 7150 7153 7151 7154 .sched_in = svm_sched_in, 7152 7155

+100 -38

arch/x86/kvm/vmx.c

··· 397 397 int cpu; 398 398 bool launched; 399 399 bool nmi_known_unmasked; 400 + bool hv_timer_armed; 400 401 /* Support for vnmi-less CPUs */ 401 402 int soft_vnmi_blocked; 402 403 ktime_t entry_time; ··· 1019 1018 /* Dynamic PLE window. */ 1020 1019 int ple_window; 1021 1020 bool ple_window_dirty; 1021 + 1022 + bool req_immediate_exit; 1022 1023 1023 1024 /* Support for PML */ 1024 1025 #define PML_ENTITY_NUM 512 ··· 2866 2863 unsigned long fs_base, gs_base; 2867 2864 u16 fs_sel, gs_sel; 2868 2865 int i; 2866 + 2867 + vmx->req_immediate_exit = false; 2869 2868 2870 2869 if (vmx->loaded_cpu_state) 2871 2870 return; ··· 5398 5393 * To use VMXON (and later other VMX instructions), a guest 5399 5394 * must first be able to turn on cr4.VMXE (see handle_vmon()). 5400 5395 * So basically the check on whether to allow nested VMX 5401 - * is here. 5396 + * is here. We operate under the default treatment of SMM, 5397 + * so VMX cannot be enabled under SMM. 5402 5398 */ 5403 - if (!nested_vmx_allowed(vcpu)) 5399 + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) 5404 5400 return 1; 5405 5401 } 5406 5402 ··· 6187 6181 } 6188 6182 6189 6183 nested_mark_vmcs12_pages_dirty(vcpu); 6184 + } 6185 + 6186 + static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 6187 + { 6188 + struct vcpu_vmx *vmx = to_vmx(vcpu); 6189 + void *vapic_page; 6190 + u32 vppr; 6191 + int rvi; 6192 + 6193 + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 6194 + !nested_cpu_has_vid(get_vmcs12(vcpu)) || 6195 + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) 6196 + return false; 6197 + 6198 + rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff; 6199 + 6200 + vapic_page = kmap(vmx->nested.virtual_apic_page); 6201 + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); 6202 + kunmap(vmx->nested.virtual_apic_page); 6203 + 6204 + return ((rvi & 0xf0) > (vppr & 0xf0)); 6190 6205 } 6191 6206 6192 6207 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, ··· 7993 7966 kvm_x86_ops->enable_log_dirty_pt_masked 
= NULL; 7994 7967 } 7995 7968 7969 + if (!cpu_has_vmx_preemption_timer()) 7970 + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; 7971 + 7996 7972 if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { 7997 7973 u64 vmx_msr; 7998 7974 ··· 9238 9208 9239 9209 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 9240 9210 { 9241 - kvm_lapic_expired_hv_timer(vcpu); 9211 + if (!to_vmx(vcpu)->req_immediate_exit) 9212 + kvm_lapic_expired_hv_timer(vcpu); 9242 9213 return 1; 9243 9214 } 9244 9215 ··· 10626 10595 msrs[i].host, false); 10627 10596 } 10628 10597 10629 - static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) 10598 + static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) 10599 + { 10600 + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); 10601 + if (!vmx->loaded_vmcs->hv_timer_armed) 10602 + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 10603 + PIN_BASED_VMX_PREEMPTION_TIMER); 10604 + vmx->loaded_vmcs->hv_timer_armed = true; 10605 + } 10606 + 10607 + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) 10630 10608 { 10631 10609 struct vcpu_vmx *vmx = to_vmx(vcpu); 10632 10610 u64 tscl; 10633 10611 u32 delta_tsc; 10634 10612 10635 - if (vmx->hv_deadline_tsc == -1) 10613 + if (vmx->req_immediate_exit) { 10614 + vmx_arm_hv_timer(vmx, 0); 10636 10615 return; 10616 + } 10637 10617 10638 - tscl = rdtsc(); 10639 - if (vmx->hv_deadline_tsc > tscl) 10640 - /* sure to be 32 bit only because checked on set_hv_timer */ 10641 - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 10642 - cpu_preemption_timer_multi); 10643 - else 10644 - delta_tsc = 0; 10618 + if (vmx->hv_deadline_tsc != -1) { 10619 + tscl = rdtsc(); 10620 + if (vmx->hv_deadline_tsc > tscl) 10621 + /* set_hv_timer ensures the delta fits in 32-bits */ 10622 + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 10623 + cpu_preemption_timer_multi); 10624 + else 10625 + delta_tsc = 0; 10645 10626 10646 - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 10627 + vmx_arm_hv_timer(vmx, 
delta_tsc); 10628 + return; 10629 + } 10630 + 10631 + if (vmx->loaded_vmcs->hv_timer_armed) 10632 + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 10633 + PIN_BASED_VMX_PREEMPTION_TIMER); 10634 + vmx->loaded_vmcs->hv_timer_armed = false; 10647 10635 } 10648 10636 10649 10637 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ··· 10722 10672 10723 10673 atomic_switch_perf_msrs(vmx); 10724 10674 10725 - vmx_arm_hv_timer(vcpu); 10675 + vmx_update_hv_timer(vcpu); 10726 10676 10727 10677 /* 10728 10678 * If this vCPU has touched SPEC_CTRL, restore the guest's value if ··· 11477 11427 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 11478 11428 struct vcpu_vmx *vmx = to_vmx(vcpu); 11479 11429 11480 - if (vcpu->arch.virtual_tsc_khz == 0) 11481 - return; 11482 - 11483 - /* Make sure short timeouts reliably trigger an immediate vmexit. 11484 - * hrtimer_start does not guarantee this. */ 11485 - if (preemption_timeout <= 1) { 11430 + /* 11431 + * A timer value of zero is architecturally guaranteed to cause 11432 + * a VMExit prior to executing any instructions in the guest. 11433 + */ 11434 + if (preemption_timeout == 0) { 11486 11435 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 11487 11436 return; 11488 11437 } 11438 + 11439 + if (vcpu->arch.virtual_tsc_khz == 0) 11440 + return; 11489 11441 11490 11442 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 11491 11443 preemption_timeout *= 1000000; ··· 11698 11646 * bits 15:8 should be zero in posted_intr_nv, 11699 11647 * the descriptor address has been already checked 11700 11648 * in nested_get_vmcs12_pages. 11649 + * 11650 + * bits 5:0 of posted_intr_desc_addr should be zero. 
11701 11651 */ 11702 11652 if (nested_cpu_has_posted_intr(vmcs12) && 11703 11653 (!nested_cpu_has_vid(vmcs12) || 11704 11654 !nested_exit_intr_ack_set(vcpu) || 11705 - vmcs12->posted_intr_nv & 0xff00)) 11655 + (vmcs12->posted_intr_nv & 0xff00) || 11656 + (vmcs12->posted_intr_desc_addr & 0x3f) || 11657 + (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) 11706 11658 return -EINVAL; 11707 11659 11708 11660 /* tpr shadow is needed by all apicv features. */ ··· 12132 12076 12133 12077 exec_control = vmcs12->pin_based_vm_exec_control; 12134 12078 12135 - /* Preemption timer setting is only taken from vmcs01. */ 12136 - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 12079 + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ 12137 12080 exec_control |= vmcs_config.pin_based_exec_ctrl; 12138 - if (vmx->hv_deadline_tsc == -1) 12139 - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 12081 + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 12082 + vmx->loaded_vmcs->hv_timer_armed = false; 12140 12083 12141 12084 /* Posted interrupts setting is only taken from vmcs12. 
*/ 12142 12085 if (nested_cpu_has_posted_intr(vmcs12)) { ··· 12371 12316 12372 12317 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 12373 12318 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) 12319 + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 12320 + 12321 + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) 12374 12322 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 12375 12323 12376 12324 if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) ··· 12921 12863 return 0; 12922 12864 } 12923 12865 12866 + static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) 12867 + { 12868 + to_vmx(vcpu)->req_immediate_exit = true; 12869 + } 12870 + 12924 12871 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 12925 12872 { 12926 12873 ktime_t remaining = ··· 13316 13253 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 13317 13254 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 13318 13255 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 13319 - if (vmx->hv_deadline_tsc == -1) 13320 - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 13321 - PIN_BASED_VMX_PREEMPTION_TIMER); 13322 - else 13323 - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 13324 - PIN_BASED_VMX_PREEMPTION_TIMER); 13256 + 13325 13257 if (kvm_has_tsc_control) 13326 13258 decache_tsc_multiplier(vmx); 13327 13259 ··· 13520 13462 return -ERANGE; 13521 13463 13522 13464 vmx->hv_deadline_tsc = tscl + delta_tsc; 13523 - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 13524 - PIN_BASED_VMX_PREEMPTION_TIMER); 13525 - 13526 13465 return delta_tsc == 0; 13527 13466 } 13528 13467 13529 13468 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 13530 13469 { 13531 - struct vcpu_vmx *vmx = to_vmx(vcpu); 13532 - vmx->hv_deadline_tsc = -1; 13533 - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 13534 - PIN_BASED_VMX_PREEMPTION_TIMER); 13470 + to_vmx(vcpu)->hv_deadline_tsc = -1; 13535 13471 } 13536 13472 #endif 13537 13473 ··· 14006 13954 
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 14007 13955 return -EINVAL; 14008 13956 13957 + /* 13958 + * SMM temporarily disables VMX, so we cannot be in guest mode, 13959 + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 13960 + * must be zero. 13961 + */ 13962 + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) 13963 + return -EINVAL; 13964 + 14009 13965 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 14010 13966 !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 14011 13967 return -EINVAL; ··· 14157 14097 .apicv_post_state_restore = vmx_apicv_post_state_restore, 14158 14098 .hwapic_irr_update = vmx_hwapic_irr_update, 14159 14099 .hwapic_isr_update = vmx_hwapic_isr_update, 14100 + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, 14160 14101 .sync_pir_to_irr = vmx_sync_pir_to_irr, 14161 14102 .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 14162 14103 ··· 14191 14130 .umip_emulated = vmx_umip_emulated, 14192 14131 14193 14132 .check_nested_events = vmx_check_nested_events, 14133 + .request_immediate_exit = vmx_request_immediate_exit, 14194 14134 14195 14135 .sched_in = vmx_sched_in, 14196 14136

+65 -36

arch/x86/kvm/x86.c

··· 628 628 gfn_t gfn; 629 629 int r; 630 630 631 - if (is_long_mode(vcpu) || !is_pae(vcpu)) 631 + if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) 632 632 return false; 633 633 634 634 if (!test_bit(VCPU_EXREG_PDPTR, ··· 2537 2537 break; 2538 2538 case MSR_PLATFORM_INFO: 2539 2539 if (!msr_info->host_initiated || 2540 - data & ~MSR_PLATFORM_INFO_CPUID_FAULT || 2541 2540 (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && 2542 2541 cpuid_fault_enabled(vcpu))) 2543 2542 return 1; ··· 2779 2780 msr_info->data = vcpu->arch.osvw.status; 2780 2781 break; 2781 2782 case MSR_PLATFORM_INFO: 2783 + if (!msr_info->host_initiated && 2784 + !vcpu->kvm->arch.guest_can_read_msr_platform_info) 2785 + return 1; 2782 2786 msr_info->data = vcpu->arch.msr_platform_info; 2783 2787 break; 2784 2788 case MSR_MISC_FEATURES_ENABLES: ··· 2929 2927 case KVM_CAP_SPLIT_IRQCHIP: 2930 2928 case KVM_CAP_IMMEDIATE_EXIT: 2931 2929 case KVM_CAP_GET_MSR_FEATURES: 2930 + case KVM_CAP_MSR_PLATFORM_INFO: 2932 2931 r = 1; 2933 2932 break; 2934 2933 case KVM_CAP_SYNC_REGS: ··· 4010 4007 break; 4011 4008 4012 4009 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); 4010 + r = -EFAULT; 4013 4011 if (get_user(user_data_size, &user_kvm_nested_state->size)) 4014 - return -EFAULT; 4012 + break; 4015 4013 4016 4014 r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, 4017 4015 user_data_size); 4018 4016 if (r < 0) 4019 - return r; 4017 + break; 4020 4018 4021 4019 if (r > user_data_size) { 4022 4020 if (put_user(r, &user_kvm_nested_state->size)) 4023 - return -EFAULT; 4024 - return -E2BIG; 4021 + r = -EFAULT; 4022 + else 4023 + r = -E2BIG; 4024 + break; 4025 4025 } 4026 + 4026 4027 r = 0; 4027 4028 break; 4028 4029 } ··· 4038 4031 if (!kvm_x86_ops->set_nested_state) 4039 4032 break; 4040 4033 4034 + r = -EFAULT; 4041 4035 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) 4042 - return -EFAULT; 4036 + break; 4043 4037 4038 + r = -EINVAL; 4044 4039 if 
(kvm_state.size < sizeof(kvm_state)) 4045 - return -EINVAL; 4040 + break; 4046 4041 4047 4042 if (kvm_state.flags & 4048 4043 ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) 4049 - return -EINVAL; 4044 + break; 4050 4045 4051 4046 /* nested_run_pending implies guest_mode. */ 4052 4047 if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) 4053 - return -EINVAL; 4048 + break; 4054 4049 4055 4050 r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); 4056 4051 break; ··· 4357 4348 kvm->arch.hlt_in_guest = true; 4358 4349 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) 4359 4350 kvm->arch.pause_in_guest = true; 4351 + r = 0; 4352 + break; 4353 + case KVM_CAP_MSR_PLATFORM_INFO: 4354 + kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; 4360 4355 r = 0; 4361 4356 break; 4362 4357 default: ··· 7374 7361 } 7375 7362 EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); 7376 7363 7364 + void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) 7365 + { 7366 + smp_send_reschedule(vcpu->cpu); 7367 + } 7368 + EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); 7369 + 7377 7370 /* 7378 7371 * Returns 1 to let vcpu_run() continue the guest execution loop without 7379 7372 * exiting to the userspace. Otherwise, the value will be returned to the ··· 7584 7565 7585 7566 if (req_immediate_exit) { 7586 7567 kvm_make_request(KVM_REQ_EVENT, vcpu); 7587 - smp_send_reschedule(vcpu->cpu); 7568 + kvm_x86_ops->request_immediate_exit(vcpu); 7588 7569 } 7589 7570 7590 7571 trace_kvm_entry(vcpu->vcpu_id); ··· 7846 7827 run->mmio.is_write = vcpu->mmio_is_write; 7847 7828 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 7848 7829 return 0; 7830 + } 7831 + 7832 + /* Swap (qemu) user FPU context for the guest FPU context. */ 7833 + static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 7834 + { 7835 + preempt_disable(); 7836 + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); 7837 + /* PKRU is separately restored in kvm_x86_ops->run. 
*/ 7838 + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, 7839 + ~XFEATURE_MASK_PKRU); 7840 + preempt_enable(); 7841 + trace_kvm_fpu(1); 7842 + } 7843 + 7844 + /* When vcpu_run ends, restore user space FPU context. */ 7845 + static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 7846 + { 7847 + preempt_disable(); 7848 + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); 7849 + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); 7850 + preempt_enable(); 7851 + ++vcpu->stat.fpu_reload; 7852 + trace_kvm_fpu(0); 7849 7853 } 7850 7854 7851 7855 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ··· 8219 8177 kvm_update_cpuid(vcpu); 8220 8178 8221 8179 idx = srcu_read_lock(&vcpu->kvm->srcu); 8222 - if (!is_long_mode(vcpu) && is_pae(vcpu)) { 8180 + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { 8223 8181 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 8224 8182 mmu_reset_needed = 1; 8225 8183 } ··· 8446 8404 vcpu->arch.xcr0 = XFEATURE_MASK_FP; 8447 8405 8448 8406 vcpu->arch.cr0 |= X86_CR0_ET; 8449 - } 8450 - 8451 - /* Swap (qemu) user FPU context for the guest FPU context. */ 8452 - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 8453 - { 8454 - preempt_disable(); 8455 - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); 8456 - /* PKRU is separately restored in kvm_x86_ops->run. */ 8457 - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, 8458 - ~XFEATURE_MASK_PKRU); 8459 - preempt_enable(); 8460 - trace_kvm_fpu(1); 8461 - } 8462 - 8463 - /* When vcpu_run ends, restore user space FPU context. 
*/ 8464 - void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 8465 - { 8466 - preempt_disable(); 8467 - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); 8468 - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); 8469 - preempt_enable(); 8470 - ++vcpu->stat.fpu_reload; 8471 - trace_kvm_fpu(0); 8472 8407 } 8473 8408 8474 8409 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) ··· 8871 8852 kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); 8872 8853 pvclock_update_vm_gtod_copy(kvm); 8873 8854 8855 + kvm->arch.guest_can_read_msr_platform_info = true; 8856 + 8874 8857 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); 8875 8858 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); 8876 8859 ··· 9221 9200 kvm_page_track_flush_slot(kvm, slot); 9222 9201 } 9223 9202 9203 + static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 9204 + { 9205 + return (is_guest_mode(vcpu) && 9206 + kvm_x86_ops->guest_apic_has_interrupt && 9207 + kvm_x86_ops->guest_apic_has_interrupt(vcpu)); 9208 + } 9209 + 9224 9210 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 9225 9211 { 9226 9212 if (!list_empty_careful(&vcpu->async_pf.done)) ··· 9252 9224 return true; 9253 9225 9254 9226 if (kvm_arch_interrupt_allowed(vcpu) && 9255 - kvm_cpu_has_interrupt(vcpu)) 9227 + (kvm_cpu_has_interrupt(vcpu) || 9228 + kvm_guest_apic_has_interrupt(vcpu))) 9256 9229 return true; 9257 9230 9258 9231 if (kvm_hv_has_stimer_pending(vcpu))

-2

include/linux/kvm_host.h

··· 733 733 void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 734 734 int kvm_vcpu_yield_to(struct kvm_vcpu *target); 735 735 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); 736 - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 737 - void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 738 736 739 737 void kvm_flush_remote_tlbs(struct kvm *kvm); 740 738 void kvm_reload_remote_mmus(struct kvm *kvm);

+1

include/uapi/linux/kvm.h

··· 952 952 #define KVM_CAP_S390_HPAGE_1M 156 953 953 #define KVM_CAP_NESTED_STATE 157 954 954 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 955 + #define KVM_CAP_MSR_PLATFORM_INFO 159 955 956 956 957 #ifdef KVM_CAP_IRQ_ROUTING 957 958

+1

tools/testing/selftests/kvm/.gitignore

··· 1 1 cr4_cpuid_sync_test 2 + platform_info_test 2 3 set_sregs_test 3 4 sync_regs_test 4 5 vmx_tsc_adjust_test

+3 -2

tools/testing/selftests/kvm/Makefile

··· 6 6 LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c 7 7 LIBKVM_x86_64 = lib/x86.c lib/vmx.c 8 8 9 - TEST_GEN_PROGS_x86_64 = set_sregs_test 9 + TEST_GEN_PROGS_x86_64 = platform_info_test 10 + TEST_GEN_PROGS_x86_64 += set_sregs_test 10 11 TEST_GEN_PROGS_x86_64 += sync_regs_test 11 12 TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test 12 13 TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test ··· 21 20 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ 22 21 LINUX_TOOL_INCLUDE = $(top_srcdir)tools/include 23 22 CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I.. 24 - LDFLAGS += -lpthread 23 + LDFLAGS += -pthread 25 24 26 25 # After inclusion, $(OUTPUT) is defined and 27 26 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/

+4

tools/testing/selftests/kvm/include/kvm_util.h

··· 50 50 }; 51 51 52 52 int kvm_check_cap(long cap); 53 + int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); 53 54 54 55 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); 55 56 void kvm_vm_free(struct kvm_vm *vmp); ··· 109 108 struct kvm_vcpu_events *events); 110 109 void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, 111 110 struct kvm_vcpu_events *events); 111 + uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); 112 + void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, 113 + uint64_t msr_value); 112 114 113 115 const char *exit_reason_str(unsigned int exit_reason); 114 116

+89

tools/testing/selftests/kvm/lib/kvm_util.c

··· 63 63 return ret; 64 64 } 65 65 66 + /* VM Enable Capability 67 + * 68 + * Input Args: 69 + * vm - Virtual Machine 70 + * cap - Capability 71 + * 72 + * Output Args: None 73 + * 74 + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. 75 + * 76 + * Enables a capability (KVM_CAP_*) on the VM. 77 + */ 78 + int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) 79 + { 80 + int ret; 81 + 82 + ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); 83 + TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" 84 + " rc: %i errno: %i", ret, errno); 85 + 86 + return ret; 87 + } 88 + 66 89 static void vm_open(struct kvm_vm *vm, int perm) 67 90 { 68 91 vm->kvm_fd = open(KVM_DEV_PATH, perm); ··· 1241 1218 ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); 1242 1219 TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", 1243 1220 ret, errno); 1221 + } 1222 + 1223 + /* VCPU Get MSR 1224 + * 1225 + * Input Args: 1226 + * vm - Virtual Machine 1227 + * vcpuid - VCPU ID 1228 + * msr_index - Index of MSR 1229 + * 1230 + * Output Args: None 1231 + * 1232 + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. 1233 + * 1234 + * Get value of MSR for VCPU. 
1235 + */ 1236 + uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) 1237 + { 1238 + struct vcpu *vcpu = vcpu_find(vm, vcpuid); 1239 + struct { 1240 + struct kvm_msrs header; 1241 + struct kvm_msr_entry entry; 1242 + } buffer = {}; 1243 + int r; 1244 + 1245 + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 1246 + buffer.header.nmsrs = 1; 1247 + buffer.entry.index = msr_index; 1248 + r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); 1249 + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" 1250 + " rc: %i errno: %i", r, errno); 1251 + 1252 + return buffer.entry.data; 1253 + } 1254 + 1255 + /* VCPU Set MSR 1256 + * 1257 + * Input Args: 1258 + * vm - Virtual Machine 1259 + * vcpuid - VCPU ID 1260 + * msr_index - Index of MSR 1261 + * msr_value - New value of MSR 1262 + * 1263 + * Output Args: None 1264 + * 1265 + * Return: On success, nothing. On failure a TEST_ASSERT is produced. 1266 + * 1267 + * Set value of MSR for VCPU. 1268 + */ 1269 + void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, 1270 + uint64_t msr_value) 1271 + { 1272 + struct vcpu *vcpu = vcpu_find(vm, vcpuid); 1273 + struct { 1274 + struct kvm_msrs header; 1275 + struct kvm_msr_entry entry; 1276 + } buffer = {}; 1277 + int r; 1278 + 1279 + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 1280 + memset(&buffer, 0, sizeof(buffer)); 1281 + buffer.header.nmsrs = 1; 1282 + buffer.entry.index = msr_index; 1283 + buffer.entry.data = msr_value; 1284 + r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); 1285 + TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" 1286 + " rc: %i errno: %i", r, errno); 1244 1287 } 1245 1288 1246 1289 /* VM VCPU Args Set

+110

tools/testing/selftests/kvm/platform_info_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Test for x86 KVM_CAP_MSR_PLATFORM_INFO 4 + * 5 + * Copyright (C) 2018, Google LLC. 6 + * 7 + * This work is licensed under the terms of the GNU GPL, version 2. 8 + * 9 + * Verifies expected behavior of controlling guest access to 10 + * MSR_PLATFORM_INFO. 11 + */ 12 + 13 + #define _GNU_SOURCE /* for program_invocation_short_name */ 14 + #include <fcntl.h> 15 + #include <stdio.h> 16 + #include <stdlib.h> 17 + #include <string.h> 18 + #include <sys/ioctl.h> 19 + 20 + #include "test_util.h" 21 + #include "kvm_util.h" 22 + #include "x86.h" 23 + 24 + #define VCPU_ID 0 25 + #define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 26 + 27 + static void guest_code(void) 28 + { 29 + uint64_t msr_platform_info; 30 + 31 + for (;;) { 32 + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); 33 + GUEST_SYNC(msr_platform_info); 34 + asm volatile ("inc %r11"); 35 + } 36 + } 37 + 38 + static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) 39 + { 40 + struct kvm_enable_cap cap = {}; 41 + 42 + cap.cap = KVM_CAP_MSR_PLATFORM_INFO; 43 + cap.flags = 0; 44 + cap.args[0] = (int)enable; 45 + vm_enable_cap(vm, &cap); 46 + } 47 + 48 + static void test_msr_platform_info_enabled(struct kvm_vm *vm) 49 + { 50 + struct kvm_run *run = vcpu_state(vm, VCPU_ID); 51 + struct guest_args args; 52 + 53 + set_msr_platform_info_enabled(vm, true); 54 + vcpu_run(vm, VCPU_ID); 55 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 56 + "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", 57 + run->exit_reason, 58 + exit_reason_str(run->exit_reason)); 59 + guest_args_read(vm, VCPU_ID, &args); 60 + TEST_ASSERT(args.port == GUEST_PORT_SYNC, 61 + "Received IO from port other than PORT_HOST_SYNC: %u\n", 62 + run->io.port); 63 + TEST_ASSERT((args.arg1 & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == 64 + MSR_PLATFORM_INFO_MAX_TURBO_RATIO, 65 + "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", 66 + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); 67 + } 68 + 69 + static 
void test_msr_platform_info_disabled(struct kvm_vm *vm) 70 + { 71 + struct kvm_run *run = vcpu_state(vm, VCPU_ID); 72 + 73 + set_msr_platform_info_enabled(vm, false); 74 + vcpu_run(vm, VCPU_ID); 75 + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, 76 + "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", 77 + run->exit_reason, 78 + exit_reason_str(run->exit_reason)); 79 + } 80 + 81 + int main(int argc, char *argv[]) 82 + { 83 + struct kvm_vm *vm; 84 + struct kvm_run *state; 85 + int rv; 86 + uint64_t msr_platform_info; 87 + 88 + /* Tell stdout not to buffer its content */ 89 + setbuf(stdout, NULL); 90 + 91 + rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); 92 + if (!rv) { 93 + fprintf(stderr, 94 + "KVM_CAP_MSR_PLATFORM_INFO not supported, skip test\n"); 95 + exit(KSFT_SKIP); 96 + } 97 + 98 + vm = vm_create_default(VCPU_ID, 0, guest_code); 99 + 100 + msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); 101 + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, 102 + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); 103 + test_msr_platform_info_disabled(vm); 104 + test_msr_platform_info_enabled(vm); 105 + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); 106 + 107 + kvm_vm_free(vm); 108 + 109 + return 0; 110 + }

Configure Feed

Configure Feed