Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Paolo writes:
"It's mostly small bugfixes and cleanups, mostly around x86 nested
virtualization. One important change, not related to nested
virtualization, is that the ability for the guest kernel to trap
CPUID instructions (in Linux that's the ARCH_SET_CPUID arch_prctl) is
now masked by default. This is because the feature is detected
through an MSR; a very bad idea that Intel seems to like more and
more. Some applications choke if the other fields of that MSR are
not initialized as on real hardware, hence we have to disable the
whole MSR by default, as was the case before Linux 4.12."

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (23 commits)
KVM: nVMX: Fix bad cleanup on error of get/set nested state IOCTLs
kvm: selftests: Add platform_info_test
KVM: x86: Control guest reads of MSR_PLATFORM_INFO
KVM: x86: Turbo bits in MSR_PLATFORM_INFO
nVMX x86: Check VPID value on vmentry of L2 guests
nVMX x86: check posted-interrupt descriptor address on vmentry of L2
KVM: nVMX: Wake blocked vCPU in guest-mode if pending interrupt in virtual APICv
KVM: VMX: check nested state and CR4.VMXE against SMM
kvm: x86: make kvm_{load|put}_guest_fpu() static
x86/hyper-v: rename ipi_arg_{ex,non_ex} structures
KVM: VMX: use preemption timer to force immediate VMExit
KVM: VMX: modify preemption timer bit only when arming timer
KVM: VMX: immediately mark preemption timer expired only for zero value
KVM: SVM: Switch to bitmap_zalloc()
KVM/MMU: Fix comment in walk_shadow_page_lockless_end()
kvm: selftests: use -pthread instead of -lpthread
KVM: x86: don't reset root in kvm_mmu_setup()
kvm: mmu: Don't read PDPTEs when paging is not enabled
x86/kvm/lapic: always disable MMIO interface in x2APIC mode
KVM: s390: Make huge pages unavailable in ucontrol VMs
...

+537 -244
+11 -1
Documentation/virtual/kvm/api.txt
··· 4510 4510 Architectures: s390 4511 4511 Parameters: none 4512 4512 Returns: 0 on success, -EINVAL if hpage module parameter was not set 4513 - or cmma is enabled 4513 + or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL 4514 + flag set 4514 4515 4515 4516 With this capability the KVM support for memory backing with 1m pages 4516 4517 through hugetlbfs can be enabled for a VM. After the capability is ··· 4521 4520 4522 4521 While it is generally possible to create a huge page backed VM without 4523 4522 this capability, the VM will not be able to run. 4523 + 4524 + 7.14 KVM_CAP_MSR_PLATFORM_INFO 4525 + 4526 + Architectures: x86 4527 + Parameters: args[0] whether feature should be enabled or not 4528 + 4529 + With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, 4530 + a #GP would be raised when the guest tries to access. Currently, this 4531 + capability does not enable write permissions of this MSR for the guest. 4524 4532 4525 4533 8. Other capabilities. 4526 4534 ----------------------
-1
arch/powerpc/include/asm/book3s/64/pgtable.h
··· 1051 1051 return hash__vmemmap_remove_mapping(start, page_size); 1052 1052 } 1053 1053 #endif 1054 - struct page *realmode_pfn_to_page(unsigned long pfn); 1055 1054 1056 1055 static inline pte_t pmd_pte(pmd_t pmd) 1057 1056 {
-2
arch/powerpc/include/asm/iommu.h
··· 220 220 extern int __init tce_iommu_bus_notifier_init(void); 221 221 extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, 222 222 unsigned long *hpa, enum dma_data_direction *direction); 223 - extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, 224 - unsigned long *hpa, enum dma_data_direction *direction); 225 223 #else 226 224 static inline void iommu_register_group(struct iommu_table_group *table_group, 227 225 int pci_domain_number,
+1
arch/powerpc/include/asm/mmu_context.h
··· 38 38 unsigned long ua, unsigned int pageshift, unsigned long *hpa); 39 39 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, 40 40 unsigned long ua, unsigned int pageshift, unsigned long *hpa); 41 + extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua); 41 42 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); 42 43 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); 43 44 #endif
-25
arch/powerpc/kernel/iommu.c
··· 1013 1013 } 1014 1014 EXPORT_SYMBOL_GPL(iommu_tce_xchg); 1015 1015 1016 - #ifdef CONFIG_PPC_BOOK3S_64 1017 - long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, 1018 - unsigned long *hpa, enum dma_data_direction *direction) 1019 - { 1020 - long ret; 1021 - 1022 - ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); 1023 - 1024 - if (!ret && ((*direction == DMA_FROM_DEVICE) || 1025 - (*direction == DMA_BIDIRECTIONAL))) { 1026 - struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT); 1027 - 1028 - if (likely(pg)) { 1029 - SetPageDirty(pg); 1030 - } else { 1031 - tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); 1032 - ret = -EFAULT; 1033 - } 1034 - } 1035 - 1036 - return ret; 1037 - } 1038 - EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm); 1039 - #endif 1040 - 1041 1016 int iommu_take_ownership(struct iommu_table *tbl) 1042 1017 { 1043 1018 unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+37 -54
arch/powerpc/kvm/book3s_64_mmu_radix.c
··· 525 525 unsigned long ea, unsigned long dsisr) 526 526 { 527 527 struct kvm *kvm = vcpu->kvm; 528 - unsigned long mmu_seq, pte_size; 529 - unsigned long gpa, gfn, hva, pfn; 528 + unsigned long mmu_seq; 529 + unsigned long gpa, gfn, hva; 530 530 struct kvm_memory_slot *memslot; 531 531 struct page *page = NULL; 532 532 long ret; ··· 623 623 */ 624 624 hva = gfn_to_hva_memslot(memslot, gfn); 625 625 if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { 626 - pfn = page_to_pfn(page); 627 626 upgrade_write = true; 628 627 } else { 628 + unsigned long pfn; 629 + 629 630 /* Call KVM generic code to do the slow-path check */ 630 631 pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 631 632 writing, upgrade_p); ··· 640 639 } 641 640 } 642 641 643 - /* See if we can insert a 1GB or 2MB large PTE here */ 644 - level = 0; 645 - if (page && PageCompound(page)) { 646 - pte_size = PAGE_SIZE << compound_order(compound_head(page)); 647 - if (pte_size >= PUD_SIZE && 648 - (gpa & (PUD_SIZE - PAGE_SIZE)) == 649 - (hva & (PUD_SIZE - PAGE_SIZE))) { 650 - level = 2; 651 - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); 652 - } else if (pte_size >= PMD_SIZE && 653 - (gpa & (PMD_SIZE - PAGE_SIZE)) == 654 - (hva & (PMD_SIZE - PAGE_SIZE))) { 655 - level = 1; 656 - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); 642 + /* 643 + * Read the PTE from the process' radix tree and use that 644 + * so we get the shift and attribute bits. 
645 + */ 646 + local_irq_disable(); 647 + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); 648 + pte = *ptep; 649 + local_irq_enable(); 650 + 651 + /* Get pte level from shift/size */ 652 + if (shift == PUD_SHIFT && 653 + (gpa & (PUD_SIZE - PAGE_SIZE)) == 654 + (hva & (PUD_SIZE - PAGE_SIZE))) { 655 + level = 2; 656 + } else if (shift == PMD_SHIFT && 657 + (gpa & (PMD_SIZE - PAGE_SIZE)) == 658 + (hva & (PMD_SIZE - PAGE_SIZE))) { 659 + level = 1; 660 + } else { 661 + level = 0; 662 + if (shift > PAGE_SHIFT) { 663 + /* 664 + * If the pte maps more than one page, bring over 665 + * bits from the virtual address to get the real 666 + * address of the specific single page we want. 667 + */ 668 + unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; 669 + pte = __pte(pte_val(pte) | (hva & rpnmask)); 657 670 } 658 671 } 659 672 660 - /* 661 - * Compute the PTE value that we need to insert. 662 - */ 663 - if (page) { 664 - pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | 665 - _PAGE_ACCESSED; 666 - if (writing || upgrade_write) 667 - pgflags |= _PAGE_WRITE | _PAGE_DIRTY; 668 - pte = pfn_pte(pfn, __pgprot(pgflags)); 673 + pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); 674 + if (writing || upgrade_write) { 675 + if (pte_val(pte) & _PAGE_WRITE) 676 + pte = __pte(pte_val(pte) | _PAGE_DIRTY); 669 677 } else { 670 - /* 671 - * Read the PTE from the process' radix tree and use that 672 - * so we get the attribute bits. 
673 - */ 674 - local_irq_disable(); 675 - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); 676 - pte = *ptep; 677 - local_irq_enable(); 678 - if (shift == PUD_SHIFT && 679 - (gpa & (PUD_SIZE - PAGE_SIZE)) == 680 - (hva & (PUD_SIZE - PAGE_SIZE))) { 681 - level = 2; 682 - } else if (shift == PMD_SHIFT && 683 - (gpa & (PMD_SIZE - PAGE_SIZE)) == 684 - (hva & (PMD_SIZE - PAGE_SIZE))) { 685 - level = 1; 686 - } else if (shift && shift != PAGE_SHIFT) { 687 - /* Adjust PFN */ 688 - unsigned long mask = (1ul << shift) - PAGE_SIZE; 689 - pte = __pte(pte_val(pte) | (hva & mask)); 690 - } 691 - pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); 692 - if (writing || upgrade_write) { 693 - if (pte_val(pte) & _PAGE_WRITE) 694 - pte = __pte(pte_val(pte) | _PAGE_DIRTY); 695 - } else { 696 - pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); 697 - } 678 + pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); 698 679 } 699 680 700 681 /* Allocate space in the tree and write the PTE */
+31 -8
arch/powerpc/kvm/book3s_64_vio_hv.c
··· 187 187 EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); 188 188 189 189 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 190 - static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) 190 + static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, 191 + unsigned long entry, unsigned long *hpa, 192 + enum dma_data_direction *direction) 193 + { 194 + long ret; 195 + 196 + ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); 197 + 198 + if (!ret && ((*direction == DMA_FROM_DEVICE) || 199 + (*direction == DMA_BIDIRECTIONAL))) { 200 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); 201 + /* 202 + * kvmppc_rm_tce_iommu_do_map() updates the UA cache after 203 + * calling this so we still get here a valid UA. 204 + */ 205 + if (pua && *pua) 206 + mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua)); 207 + } 208 + 209 + return ret; 210 + } 211 + 212 + static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, 213 + unsigned long entry) 191 214 { 192 215 unsigned long hpa = 0; 193 216 enum dma_data_direction dir = DMA_NONE; 194 217 195 - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); 218 + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 196 219 } 197 220 198 221 static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, ··· 247 224 unsigned long hpa = 0; 248 225 long ret; 249 226 250 - if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) 227 + if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir)) 251 228 /* 252 229 * real mode xchg can fail if struct page crosses 253 230 * a page boundary ··· 259 236 260 237 ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); 261 238 if (ret) 262 - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); 239 + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 263 240 264 241 return ret; 265 242 } ··· 305 282 if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) 306 283 return H_CLOSED; 307 284 308 - ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); 285 + ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); 
309 286 if (ret) { 310 287 mm_iommu_mapped_dec(mem); 311 288 /* ··· 394 371 return ret; 395 372 396 373 WARN_ON_ONCE_RM(1); 397 - kvmppc_rm_clear_tce(stit->tbl, entry); 374 + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 398 375 } 399 376 400 377 kvmppc_tce_put(stt, entry, tce); ··· 543 520 goto unlock_exit; 544 521 545 522 WARN_ON_ONCE_RM(1); 546 - kvmppc_rm_clear_tce(stit->tbl, entry); 523 + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 547 524 } 548 525 549 526 kvmppc_tce_put(stt, entry + i, tce); ··· 594 571 return ret; 595 572 596 573 WARN_ON_ONCE_RM(1); 597 - kvmppc_rm_clear_tce(stit->tbl, entry); 574 + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); 598 575 } 599 576 } 600 577
-49
arch/powerpc/mm/init_64.c
··· 308 308 { 309 309 } 310 310 311 - /* 312 - * We do not have access to the sparsemem vmemmap, so we fallback to 313 - * walking the list of sparsemem blocks which we already maintain for 314 - * the sake of crashdump. In the long run, we might want to maintain 315 - * a tree if performance of that linear walk becomes a problem. 316 - * 317 - * realmode_pfn_to_page functions can fail due to: 318 - * 1) As real sparsemem blocks do not lay in RAM continously (they 319 - * are in virtual address space which is not available in the real mode), 320 - * the requested page struct can be split between blocks so get_page/put_page 321 - * may fail. 322 - * 2) When huge pages are used, the get_page/put_page API will fail 323 - * in real mode as the linked addresses in the page struct are virtual 324 - * too. 325 - */ 326 - struct page *realmode_pfn_to_page(unsigned long pfn) 327 - { 328 - struct vmemmap_backing *vmem_back; 329 - struct page *page; 330 - unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; 331 - unsigned long pg_va = (unsigned long) pfn_to_page(pfn); 332 - 333 - for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { 334 - if (pg_va < vmem_back->virt_addr) 335 - continue; 336 - 337 - /* After vmemmap_list entry free is possible, need check all */ 338 - if ((pg_va + sizeof(struct page)) <= 339 - (vmem_back->virt_addr + page_size)) { 340 - page = (struct page *) (vmem_back->phys + pg_va - 341 - vmem_back->virt_addr); 342 - return page; 343 - } 344 - } 345 - 346 - /* Probably that page struct is split between real pages */ 347 - return NULL; 348 - } 349 - EXPORT_SYMBOL_GPL(realmode_pfn_to_page); 350 - 351 - #else 352 - 353 - struct page *realmode_pfn_to_page(unsigned long pfn) 354 - { 355 - struct page *page = pfn_to_page(pfn); 356 - return page; 357 - } 358 - EXPORT_SYMBOL_GPL(realmode_pfn_to_page); 359 - 360 311 #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 361 312 362 313 #ifdef CONFIG_PPC_BOOK3S_64
+30 -4
arch/powerpc/mm/mmu_context_iommu.c
··· 18 18 #include <linux/migrate.h> 19 19 #include <linux/hugetlb.h> 20 20 #include <linux/swap.h> 21 + #include <linux/sizes.h> 21 22 #include <asm/mmu_context.h> 22 23 #include <asm/pte-walk.h> 23 24 24 25 static DEFINE_MUTEX(mem_list_mutex); 26 + 27 + #define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 28 + #define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) 25 29 26 30 struct mm_iommu_table_group_mem_t { 27 31 struct list_head next; ··· 267 263 if (!page) 268 264 continue; 269 265 266 + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) 267 + SetPageDirty(page); 268 + 270 269 put_page(page); 271 270 mem->hpas[i] = 0; 272 271 } ··· 367 360 368 361 return ret; 369 362 } 370 - EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); 371 363 372 364 struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, 373 365 unsigned long ua, unsigned long entries) ··· 396 390 if (pageshift > mem->pageshift) 397 391 return -EFAULT; 398 392 399 - *hpa = *va | (ua & ~PAGE_MASK); 393 + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); 400 394 401 395 return 0; 402 396 } ··· 419 413 if (!pa) 420 414 return -EFAULT; 421 415 422 - *hpa = *pa | (ua & ~PAGE_MASK); 416 + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); 423 417 424 418 return 0; 425 419 } 426 - EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); 420 + 421 + extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) 422 + { 423 + struct mm_iommu_table_group_mem_t *mem; 424 + long entry; 425 + void *va; 426 + unsigned long *pa; 427 + 428 + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); 429 + if (!mem) 430 + return; 431 + 432 + entry = (ua - mem->ua) >> PAGE_SHIFT; 433 + va = &mem->hpas[entry]; 434 + 435 + pa = (void *) vmalloc_to_phys(va); 436 + if (!pa) 437 + return; 438 + 439 + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; 440 + } 427 441 428 442 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) 429 443 {
+2 -2
arch/s390/kvm/kvm-s390.c
··· 481 481 break; 482 482 case KVM_CAP_S390_HPAGE_1M: 483 483 r = 0; 484 - if (hpage) 484 + if (hpage && !kvm_is_ucontrol(kvm)) 485 485 r = 1; 486 486 break; 487 487 case KVM_CAP_S390_MEM_OP: ··· 691 691 mutex_lock(&kvm->lock); 692 692 if (kvm->created_vcpus) 693 693 r = -EBUSY; 694 - else if (!hpage || kvm->arch.use_cmma) 694 + else if (!hpage || kvm->arch.use_cmma || kvm_is_ucontrol(kvm)) 695 695 r = -EINVAL; 696 696 else { 697 697 r = 0;
+3 -1
arch/s390/mm/gmap.c
··· 708 708 vmaddr |= gaddr & ~PMD_MASK; 709 709 /* Find vma in the parent mm */ 710 710 vma = find_vma(gmap->mm, vmaddr); 711 + if (!vma) 712 + continue; 711 713 /* 712 714 * We do not discard pages that are backed by 713 715 * hugetlbfs, so we don't have to refault them. 714 716 */ 715 - if (vma && is_vm_hugetlb_page(vma)) 717 + if (is_vm_hugetlb_page(vma)) 716 718 continue; 717 719 size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); 718 720 zap_page_range(vma, vmaddr, size);
+4 -4
arch/x86/hyperv/hv_apic.c
··· 95 95 */ 96 96 static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) 97 97 { 98 - struct ipi_arg_ex **arg; 99 - struct ipi_arg_ex *ipi_arg; 98 + struct hv_send_ipi_ex **arg; 99 + struct hv_send_ipi_ex *ipi_arg; 100 100 unsigned long flags; 101 101 int nr_bank = 0; 102 102 int ret = 1; ··· 105 105 return false; 106 106 107 107 local_irq_save(flags); 108 - arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); 108 + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); 109 109 110 110 ipi_arg = *arg; 111 111 if (unlikely(!ipi_arg)) ··· 135 135 static bool __send_ipi_mask(const struct cpumask *mask, int vector) 136 136 { 137 137 int cur_cpu, vcpu; 138 - struct ipi_arg_non_ex ipi_arg; 138 + struct hv_send_ipi ipi_arg; 139 139 int ret = 1; 140 140 141 141 trace_hyperv_send_ipi_mask(mask, vector);
+9 -7
arch/x86/include/asm/hyperv-tlfs.h
··· 726 726 #define HV_STIMER_AUTOENABLE (1ULL << 3) 727 727 #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) 728 728 729 - struct ipi_arg_non_ex { 730 - u32 vector; 731 - u32 reserved; 732 - u64 cpu_mask; 733 - }; 734 - 735 729 struct hv_vpset { 736 730 u64 format; 737 731 u64 valid_bank_mask; 738 732 u64 bank_contents[]; 739 733 }; 740 734 741 - struct ipi_arg_ex { 735 + /* HvCallSendSyntheticClusterIpi hypercall */ 736 + struct hv_send_ipi { 737 + u32 vector; 738 + u32 reserved; 739 + u64 cpu_mask; 740 + }; 741 + 742 + /* HvCallSendSyntheticClusterIpiEx hypercall */ 743 + struct hv_send_ipi_ex { 742 744 u32 vector; 743 745 u32 reserved; 744 746 struct hv_vpset vp_set;
+5
arch/x86/include/asm/kvm_host.h
··· 869 869 870 870 bool x2apic_format; 871 871 bool x2apic_broadcast_quirk_disabled; 872 + 873 + bool guest_can_read_msr_platform_info; 872 874 }; 873 875 874 876 struct kvm_vm_stat { ··· 1024 1022 void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); 1025 1023 void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); 1026 1024 void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); 1025 + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); 1027 1026 void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); 1028 1027 void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); 1029 1028 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); ··· 1058 1055 bool (*umip_emulated)(void); 1059 1056 1060 1057 int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); 1058 + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); 1061 1059 1062 1060 void (*sched_in)(struct kvm_vcpu *kvm, int cpu); 1063 1061 ··· 1486 1482 1487 1483 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); 1488 1484 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); 1485 + void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); 1489 1486 1490 1487 int kvm_is_in_guest(void); 1491 1488
+1
arch/x86/include/uapi/asm/kvm.h
··· 377 377 378 378 #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) 379 379 #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) 380 + #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) 380 381 381 382 #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 382 383 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002
+19 -3
arch/x86/kvm/lapic.c
··· 1344 1344 1345 1345 static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) 1346 1346 { 1347 - return kvm_apic_hw_enabled(apic) && 1348 - addr >= apic->base_address && 1349 - addr < apic->base_address + LAPIC_MMIO_LENGTH; 1347 + return addr >= apic->base_address && 1348 + addr < apic->base_address + LAPIC_MMIO_LENGTH; 1350 1349 } 1351 1350 1352 1351 static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, ··· 1356 1357 1357 1358 if (!apic_mmio_in_range(apic, address)) 1358 1359 return -EOPNOTSUPP; 1360 + 1361 + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { 1362 + if (!kvm_check_has_quirk(vcpu->kvm, 1363 + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) 1364 + return -EOPNOTSUPP; 1365 + 1366 + memset(data, 0xff, len); 1367 + return 0; 1368 + } 1359 1369 1360 1370 kvm_lapic_reg_read(apic, offset, len, data); 1361 1371 ··· 1924 1916 1925 1917 if (!apic_mmio_in_range(apic, address)) 1926 1918 return -EOPNOTSUPP; 1919 + 1920 + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { 1921 + if (!kvm_check_has_quirk(vcpu->kvm, 1922 + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) 1923 + return -EOPNOTSUPP; 1924 + 1925 + return 0; 1926 + } 1927 1927 1928 1928 /* 1929 1929 * APIC register must be aligned on 128-bits boundary.
+7 -2
arch/x86/kvm/mmu.c
··· 899 899 { 900 900 /* 901 901 * Make sure the write to vcpu->mode is not reordered in front of 902 - * reads to sptes. If it does, kvm_commit_zap_page() can see us 902 + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us 903 903 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. 904 904 */ 905 905 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); ··· 5417 5417 { 5418 5418 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 5419 5419 5420 - kvm_init_mmu(vcpu, true); 5420 + /* 5421 + * kvm_mmu_setup() is called only on vCPU initialization. 5422 + * Therefore, no need to reset mmu roots as they are not yet 5423 + * initialized. 5424 + */ 5425 + kvm_init_mmu(vcpu, false); 5421 5426 } 5422 5427 5423 5428 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
+4 -3
arch/x86/kvm/svm.c
··· 1226 1226 min_sev_asid = cpuid_edx(0x8000001F); 1227 1227 1228 1228 /* Initialize SEV ASID bitmap */ 1229 - sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), 1230 - sizeof(unsigned long), GFP_KERNEL); 1229 + sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); 1231 1230 if (!sev_asid_bitmap) 1232 1231 return 1; 1233 1232 ··· 1404 1405 int cpu; 1405 1406 1406 1407 if (svm_sev_enabled()) 1407 - kfree(sev_asid_bitmap); 1408 + bitmap_free(sev_asid_bitmap); 1408 1409 1409 1410 for_each_possible_cpu(cpu) 1410 1411 svm_cpu_uninit(cpu); ··· 7147 7148 7148 7149 .check_intercept = svm_check_intercept, 7149 7150 .handle_external_intr = svm_handle_external_intr, 7151 + 7152 + .request_immediate_exit = __kvm_request_immediate_exit, 7150 7153 7151 7154 .sched_in = svm_sched_in, 7152 7155
+100 -38
arch/x86/kvm/vmx.c
··· 397 397 int cpu; 398 398 bool launched; 399 399 bool nmi_known_unmasked; 400 + bool hv_timer_armed; 400 401 /* Support for vnmi-less CPUs */ 401 402 int soft_vnmi_blocked; 402 403 ktime_t entry_time; ··· 1019 1018 /* Dynamic PLE window. */ 1020 1019 int ple_window; 1021 1020 bool ple_window_dirty; 1021 + 1022 + bool req_immediate_exit; 1022 1023 1023 1024 /* Support for PML */ 1024 1025 #define PML_ENTITY_NUM 512 ··· 2866 2863 unsigned long fs_base, gs_base; 2867 2864 u16 fs_sel, gs_sel; 2868 2865 int i; 2866 + 2867 + vmx->req_immediate_exit = false; 2869 2868 2870 2869 if (vmx->loaded_cpu_state) 2871 2870 return; ··· 5398 5393 * To use VMXON (and later other VMX instructions), a guest 5399 5394 * must first be able to turn on cr4.VMXE (see handle_vmon()). 5400 5395 * So basically the check on whether to allow nested VMX 5401 - * is here. 5396 + * is here. We operate under the default treatment of SMM, 5397 + * so VMX cannot be enabled under SMM. 5402 5398 */ 5403 - if (!nested_vmx_allowed(vcpu)) 5399 + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) 5404 5400 return 1; 5405 5401 } 5406 5402 ··· 6187 6181 } 6188 6182 6189 6183 nested_mark_vmcs12_pages_dirty(vcpu); 6184 + } 6185 + 6186 + static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 6187 + { 6188 + struct vcpu_vmx *vmx = to_vmx(vcpu); 6189 + void *vapic_page; 6190 + u32 vppr; 6191 + int rvi; 6192 + 6193 + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 6194 + !nested_cpu_has_vid(get_vmcs12(vcpu)) || 6195 + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) 6196 + return false; 6197 + 6198 + rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff; 6199 + 6200 + vapic_page = kmap(vmx->nested.virtual_apic_page); 6201 + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); 6202 + kunmap(vmx->nested.virtual_apic_page); 6203 + 6204 + return ((rvi & 0xf0) > (vppr & 0xf0)); 6190 6205 } 6191 6206 6192 6207 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, ··· 7993 7966 kvm_x86_ops->enable_log_dirty_pt_masked 
= NULL; 7994 7967 } 7995 7968 7969 + if (!cpu_has_vmx_preemption_timer()) 7970 + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; 7971 + 7996 7972 if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { 7997 7973 u64 vmx_msr; 7998 7974 ··· 9238 9208 9239 9209 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 9240 9210 { 9241 - kvm_lapic_expired_hv_timer(vcpu); 9211 + if (!to_vmx(vcpu)->req_immediate_exit) 9212 + kvm_lapic_expired_hv_timer(vcpu); 9242 9213 return 1; 9243 9214 } 9244 9215 ··· 10626 10595 msrs[i].host, false); 10627 10596 } 10628 10597 10629 - static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) 10598 + static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) 10599 + { 10600 + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); 10601 + if (!vmx->loaded_vmcs->hv_timer_armed) 10602 + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 10603 + PIN_BASED_VMX_PREEMPTION_TIMER); 10604 + vmx->loaded_vmcs->hv_timer_armed = true; 10605 + } 10606 + 10607 + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) 10630 10608 { 10631 10609 struct vcpu_vmx *vmx = to_vmx(vcpu); 10632 10610 u64 tscl; 10633 10611 u32 delta_tsc; 10634 10612 10635 - if (vmx->hv_deadline_tsc == -1) 10613 + if (vmx->req_immediate_exit) { 10614 + vmx_arm_hv_timer(vmx, 0); 10636 10615 return; 10616 + } 10637 10617 10638 - tscl = rdtsc(); 10639 - if (vmx->hv_deadline_tsc > tscl) 10640 - /* sure to be 32 bit only because checked on set_hv_timer */ 10641 - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 10642 - cpu_preemption_timer_multi); 10643 - else 10644 - delta_tsc = 0; 10618 + if (vmx->hv_deadline_tsc != -1) { 10619 + tscl = rdtsc(); 10620 + if (vmx->hv_deadline_tsc > tscl) 10621 + /* set_hv_timer ensures the delta fits in 32-bits */ 10622 + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 10623 + cpu_preemption_timer_multi); 10624 + else 10625 + delta_tsc = 0; 10645 10626 10646 - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 10627 + vmx_arm_hv_timer(vmx, 
delta_tsc); 10628 + return; 10629 + } 10630 + 10631 + if (vmx->loaded_vmcs->hv_timer_armed) 10632 + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 10633 + PIN_BASED_VMX_PREEMPTION_TIMER); 10634 + vmx->loaded_vmcs->hv_timer_armed = false; 10647 10635 } 10648 10636 10649 10637 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ··· 10722 10672 10723 10673 atomic_switch_perf_msrs(vmx); 10724 10674 10725 - vmx_arm_hv_timer(vcpu); 10675 + vmx_update_hv_timer(vcpu); 10726 10676 10727 10677 /* 10728 10678 * If this vCPU has touched SPEC_CTRL, restore the guest's value if ··· 11477 11427 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 11478 11428 struct vcpu_vmx *vmx = to_vmx(vcpu); 11479 11429 11480 - if (vcpu->arch.virtual_tsc_khz == 0) 11481 - return; 11482 - 11483 - /* Make sure short timeouts reliably trigger an immediate vmexit. 11484 - * hrtimer_start does not guarantee this. */ 11485 - if (preemption_timeout <= 1) { 11430 + /* 11431 + * A timer value of zero is architecturally guaranteed to cause 11432 + * a VMExit prior to executing any instructions in the guest. 11433 + */ 11434 + if (preemption_timeout == 0) { 11486 11435 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 11487 11436 return; 11488 11437 } 11438 + 11439 + if (vcpu->arch.virtual_tsc_khz == 0) 11440 + return; 11489 11441 11490 11442 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 11491 11443 preemption_timeout *= 1000000; ··· 11698 11646 * bits 15:8 should be zero in posted_intr_nv, 11699 11647 * the descriptor address has been already checked 11700 11648 * in nested_get_vmcs12_pages. 11649 + * 11650 + * bits 5:0 of posted_intr_desc_addr should be zero. 
11701 11651 */ 11702 11652 if (nested_cpu_has_posted_intr(vmcs12) && 11703 11653 (!nested_cpu_has_vid(vmcs12) || 11704 11654 !nested_exit_intr_ack_set(vcpu) || 11705 - vmcs12->posted_intr_nv & 0xff00)) 11655 + (vmcs12->posted_intr_nv & 0xff00) || 11656 + (vmcs12->posted_intr_desc_addr & 0x3f) || 11657 + (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) 11706 11658 return -EINVAL; 11707 11659 11708 11660 /* tpr shadow is needed by all apicv features. */ ··· 12132 12076 12133 12077 exec_control = vmcs12->pin_based_vm_exec_control; 12134 12078 12135 - /* Preemption timer setting is only taken from vmcs01. */ 12136 - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 12079 + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ 12137 12080 exec_control |= vmcs_config.pin_based_exec_ctrl; 12138 - if (vmx->hv_deadline_tsc == -1) 12139 - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 12081 + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 12082 + vmx->loaded_vmcs->hv_timer_armed = false; 12140 12083 12141 12084 /* Posted interrupts setting is only taken from vmcs12. 
*/ 12142 12085 if (nested_cpu_has_posted_intr(vmcs12)) { ··· 12371 12316 12372 12317 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 12373 12318 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) 12319 + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 12320 + 12321 + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) 12374 12322 return VMXERR_ENTRY_INVALID_CONTROL_FIELD; 12375 12323 12376 12324 if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) ··· 12921 12863 return 0; 12922 12864 } 12923 12865 12866 + static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) 12867 + { 12868 + to_vmx(vcpu)->req_immediate_exit = true; 12869 + } 12870 + 12924 12871 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 12925 12872 { 12926 12873 ktime_t remaining = ··· 13316 13253 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 13317 13254 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 13318 13255 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 13319 - if (vmx->hv_deadline_tsc == -1) 13320 - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 13321 - PIN_BASED_VMX_PREEMPTION_TIMER); 13322 - else 13323 - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 13324 - PIN_BASED_VMX_PREEMPTION_TIMER); 13256 + 13325 13257 if (kvm_has_tsc_control) 13326 13258 decache_tsc_multiplier(vmx); 13327 13259 ··· 13520 13462 return -ERANGE; 13521 13463 13522 13464 vmx->hv_deadline_tsc = tscl + delta_tsc; 13523 - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 13524 - PIN_BASED_VMX_PREEMPTION_TIMER); 13525 - 13526 13465 return delta_tsc == 0; 13527 13466 } 13528 13467 13529 13468 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 13530 13469 { 13531 - struct vcpu_vmx *vmx = to_vmx(vcpu); 13532 - vmx->hv_deadline_tsc = -1; 13533 - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 13534 - PIN_BASED_VMX_PREEMPTION_TIMER); 13470 + to_vmx(vcpu)->hv_deadline_tsc = -1; 13535 13471 } 13536 13472 #endif 13537 13473 ··· 14006 13954 
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 14007 13955 return -EINVAL; 14008 13956 13957 + /* 13958 + * SMM temporarily disables VMX, so we cannot be in guest mode, 13959 + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 13960 + * must be zero. 13961 + */ 13962 + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) 13963 + return -EINVAL; 13964 + 14009 13965 if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 14010 13966 !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 14011 13967 return -EINVAL; ··· 14157 14097 .apicv_post_state_restore = vmx_apicv_post_state_restore, 14158 14098 .hwapic_irr_update = vmx_hwapic_irr_update, 14159 14099 .hwapic_isr_update = vmx_hwapic_isr_update, 14100 + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, 14160 14101 .sync_pir_to_irr = vmx_sync_pir_to_irr, 14161 14102 .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 14162 14103 ··· 14191 14130 .umip_emulated = vmx_umip_emulated, 14192 14131 14193 14132 .check_nested_events = vmx_check_nested_events, 14133 + .request_immediate_exit = vmx_request_immediate_exit, 14194 14134 14195 14135 .sched_in = vmx_sched_in, 14196 14136
+65 -36
arch/x86/kvm/x86.c
··· 628 628 gfn_t gfn; 629 629 int r; 630 630 631 - if (is_long_mode(vcpu) || !is_pae(vcpu)) 631 + if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) 632 632 return false; 633 633 634 634 if (!test_bit(VCPU_EXREG_PDPTR, ··· 2537 2537 break; 2538 2538 case MSR_PLATFORM_INFO: 2539 2539 if (!msr_info->host_initiated || 2540 - data & ~MSR_PLATFORM_INFO_CPUID_FAULT || 2541 2540 (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && 2542 2541 cpuid_fault_enabled(vcpu))) 2543 2542 return 1; ··· 2779 2780 msr_info->data = vcpu->arch.osvw.status; 2780 2781 break; 2781 2782 case MSR_PLATFORM_INFO: 2783 + if (!msr_info->host_initiated && 2784 + !vcpu->kvm->arch.guest_can_read_msr_platform_info) 2785 + return 1; 2782 2786 msr_info->data = vcpu->arch.msr_platform_info; 2783 2787 break; 2784 2788 case MSR_MISC_FEATURES_ENABLES: ··· 2929 2927 case KVM_CAP_SPLIT_IRQCHIP: 2930 2928 case KVM_CAP_IMMEDIATE_EXIT: 2931 2929 case KVM_CAP_GET_MSR_FEATURES: 2930 + case KVM_CAP_MSR_PLATFORM_INFO: 2932 2931 r = 1; 2933 2932 break; 2934 2933 case KVM_CAP_SYNC_REGS: ··· 4010 4007 break; 4011 4008 4012 4009 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); 4010 + r = -EFAULT; 4013 4011 if (get_user(user_data_size, &user_kvm_nested_state->size)) 4014 - return -EFAULT; 4012 + break; 4015 4013 4016 4014 r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, 4017 4015 user_data_size); 4018 4016 if (r < 0) 4019 - return r; 4017 + break; 4020 4018 4021 4019 if (r > user_data_size) { 4022 4020 if (put_user(r, &user_kvm_nested_state->size)) 4023 - return -EFAULT; 4024 - return -E2BIG; 4021 + r = -EFAULT; 4022 + else 4023 + r = -E2BIG; 4024 + break; 4025 4025 } 4026 + 4026 4027 r = 0; 4027 4028 break; 4028 4029 } ··· 4038 4031 if (!kvm_x86_ops->set_nested_state) 4039 4032 break; 4040 4033 4034 + r = -EFAULT; 4041 4035 if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) 4042 - return -EFAULT; 4036 + break; 4043 4037 4038 + r = -EINVAL; 4044 4039 if 
(kvm_state.size < sizeof(kvm_state)) 4045 - return -EINVAL; 4040 + break; 4046 4041 4047 4042 if (kvm_state.flags & 4048 4043 ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) 4049 - return -EINVAL; 4044 + break; 4050 4045 4051 4046 /* nested_run_pending implies guest_mode. */ 4052 4047 if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) 4053 - return -EINVAL; 4048 + break; 4054 4049 4055 4050 r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); 4056 4051 break; ··· 4357 4348 kvm->arch.hlt_in_guest = true; 4358 4349 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) 4359 4350 kvm->arch.pause_in_guest = true; 4351 + r = 0; 4352 + break; 4353 + case KVM_CAP_MSR_PLATFORM_INFO: 4354 + kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; 4360 4355 r = 0; 4361 4356 break; 4362 4357 default: ··· 7374 7361 } 7375 7362 EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); 7376 7363 7364 + void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) 7365 + { 7366 + smp_send_reschedule(vcpu->cpu); 7367 + } 7368 + EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); 7369 + 7377 7370 /* 7378 7371 * Returns 1 to let vcpu_run() continue the guest execution loop without 7379 7372 * exiting to the userspace. Otherwise, the value will be returned to the ··· 7584 7565 7585 7566 if (req_immediate_exit) { 7586 7567 kvm_make_request(KVM_REQ_EVENT, vcpu); 7587 - smp_send_reschedule(vcpu->cpu); 7568 + kvm_x86_ops->request_immediate_exit(vcpu); 7588 7569 } 7589 7570 7590 7571 trace_kvm_entry(vcpu->vcpu_id); ··· 7846 7827 run->mmio.is_write = vcpu->mmio_is_write; 7847 7828 vcpu->arch.complete_userspace_io = complete_emulated_mmio; 7848 7829 return 0; 7830 + } 7831 + 7832 + /* Swap (qemu) user FPU context for the guest FPU context. */ 7833 + static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 7834 + { 7835 + preempt_disable(); 7836 + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); 7837 + /* PKRU is separately restored in kvm_x86_ops->run. 
*/ 7838 + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, 7839 + ~XFEATURE_MASK_PKRU); 7840 + preempt_enable(); 7841 + trace_kvm_fpu(1); 7842 + } 7843 + 7844 + /* When vcpu_run ends, restore user space FPU context. */ 7845 + static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 7846 + { 7847 + preempt_disable(); 7848 + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); 7849 + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); 7850 + preempt_enable(); 7851 + ++vcpu->stat.fpu_reload; 7852 + trace_kvm_fpu(0); 7849 7853 } 7850 7854 7851 7855 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ··· 8219 8177 kvm_update_cpuid(vcpu); 8220 8178 8221 8179 idx = srcu_read_lock(&vcpu->kvm->srcu); 8222 - if (!is_long_mode(vcpu) && is_pae(vcpu)) { 8180 + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { 8223 8181 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); 8224 8182 mmu_reset_needed = 1; 8225 8183 } ··· 8446 8404 vcpu->arch.xcr0 = XFEATURE_MASK_FP; 8447 8405 8448 8406 vcpu->arch.cr0 |= X86_CR0_ET; 8449 - } 8450 - 8451 - /* Swap (qemu) user FPU context for the guest FPU context. */ 8452 - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 8453 - { 8454 - preempt_disable(); 8455 - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); 8456 - /* PKRU is separately restored in kvm_x86_ops->run. */ 8457 - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, 8458 - ~XFEATURE_MASK_PKRU); 8459 - preempt_enable(); 8460 - trace_kvm_fpu(1); 8461 - } 8462 - 8463 - /* When vcpu_run ends, restore user space FPU context. 
*/ 8464 - void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 8465 - { 8466 - preempt_disable(); 8467 - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); 8468 - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); 8469 - preempt_enable(); 8470 - ++vcpu->stat.fpu_reload; 8471 - trace_kvm_fpu(0); 8472 8407 } 8473 8408 8474 8409 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) ··· 8871 8852 kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); 8872 8853 pvclock_update_vm_gtod_copy(kvm); 8873 8854 8855 + kvm->arch.guest_can_read_msr_platform_info = true; 8856 + 8874 8857 INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); 8875 8858 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); 8876 8859 ··· 9221 9200 kvm_page_track_flush_slot(kvm, slot); 9222 9201 } 9223 9202 9203 + static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 9204 + { 9205 + return (is_guest_mode(vcpu) && 9206 + kvm_x86_ops->guest_apic_has_interrupt && 9207 + kvm_x86_ops->guest_apic_has_interrupt(vcpu)); 9208 + } 9209 + 9224 9210 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) 9225 9211 { 9226 9212 if (!list_empty_careful(&vcpu->async_pf.done)) ··· 9252 9224 return true; 9253 9225 9254 9226 if (kvm_arch_interrupt_allowed(vcpu) && 9255 - kvm_cpu_has_interrupt(vcpu)) 9227 + (kvm_cpu_has_interrupt(vcpu) || 9228 + kvm_guest_apic_has_interrupt(vcpu))) 9256 9229 return true; 9257 9230 9258 9231 if (kvm_hv_has_stimer_pending(vcpu))
-2
include/linux/kvm_host.h
··· 733 733 void kvm_vcpu_kick(struct kvm_vcpu *vcpu); 734 734 int kvm_vcpu_yield_to(struct kvm_vcpu *target); 735 735 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); 736 - void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); 737 - void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); 738 736 739 737 void kvm_flush_remote_tlbs(struct kvm *kvm); 740 738 void kvm_reload_remote_mmus(struct kvm *kvm);
+1
include/uapi/linux/kvm.h
··· 952 952 #define KVM_CAP_S390_HPAGE_1M 156 953 953 #define KVM_CAP_NESTED_STATE 157 954 954 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 955 + #define KVM_CAP_MSR_PLATFORM_INFO 159 955 956 956 957 #ifdef KVM_CAP_IRQ_ROUTING 957 958
+1
tools/testing/selftests/kvm/.gitignore
··· 1 1 cr4_cpuid_sync_test 2 + platform_info_test 2 3 set_sregs_test 3 4 sync_regs_test 4 5 vmx_tsc_adjust_test
+3 -2
tools/testing/selftests/kvm/Makefile
··· 6 6 LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c 7 7 LIBKVM_x86_64 = lib/x86.c lib/vmx.c 8 8 9 - TEST_GEN_PROGS_x86_64 = set_sregs_test 9 + TEST_GEN_PROGS_x86_64 = platform_info_test 10 + TEST_GEN_PROGS_x86_64 += set_sregs_test 10 11 TEST_GEN_PROGS_x86_64 += sync_regs_test 11 12 TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test 12 13 TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test ··· 21 20 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ 22 21 LINUX_TOOL_INCLUDE = $(top_srcdir)tools/include 23 22 CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -I.. 24 - LDFLAGS += -lpthread 23 + LDFLAGS += -pthread 25 24 26 25 # After inclusion, $(OUTPUT) is defined and 27 26 # $(TEST_GEN_PROGS) starts with $(OUTPUT)/
+4
tools/testing/selftests/kvm/include/kvm_util.h
··· 50 50 }; 51 51 52 52 int kvm_check_cap(long cap); 53 + int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); 53 54 54 55 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); 55 56 void kvm_vm_free(struct kvm_vm *vmp); ··· 109 108 struct kvm_vcpu_events *events); 110 109 void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, 111 110 struct kvm_vcpu_events *events); 111 + uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); 112 + void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, 113 + uint64_t msr_value); 112 114 113 115 const char *exit_reason_str(unsigned int exit_reason); 114 116
+89
tools/testing/selftests/kvm/lib/kvm_util.c
··· 63 63 return ret; 64 64 } 65 65 66 + /* VM Enable Capability 67 + * 68 + * Input Args: 69 + * vm - Virtual Machine 70 + * cap - Capability 71 + * 72 + * Output Args: None 73 + * 74 + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. 75 + * 76 + * Enables a capability (KVM_CAP_*) on the VM. 77 + */ 78 + int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) 79 + { 80 + int ret; 81 + 82 + ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); 83 + TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" 84 + " rc: %i errno: %i", ret, errno); 85 + 86 + return ret; 87 + } 88 + 66 89 static void vm_open(struct kvm_vm *vm, int perm) 67 90 { 68 91 vm->kvm_fd = open(KVM_DEV_PATH, perm); ··· 1241 1218 ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events); 1242 1219 TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", 1243 1220 ret, errno); 1221 + } 1222 + 1223 + /* VCPU Get MSR 1224 + * 1225 + * Input Args: 1226 + * vm - Virtual Machine 1227 + * vcpuid - VCPU ID 1228 + * msr_index - Index of MSR 1229 + * 1230 + * Output Args: None 1231 + * 1232 + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. 1233 + * 1234 + * Get value of MSR for VCPU. 
1235 + */ 1236 + uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) 1237 + { 1238 + struct vcpu *vcpu = vcpu_find(vm, vcpuid); 1239 + struct { 1240 + struct kvm_msrs header; 1241 + struct kvm_msr_entry entry; 1242 + } buffer = {}; 1243 + int r; 1244 + 1245 + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 1246 + buffer.header.nmsrs = 1; 1247 + buffer.entry.index = msr_index; 1248 + r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); 1249 + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" 1250 + " rc: %i errno: %i", r, errno); 1251 + 1252 + return buffer.entry.data; 1253 + } 1254 + 1255 + /* VCPU Set MSR 1256 + * 1257 + * Input Args: 1258 + * vm - Virtual Machine 1259 + * vcpuid - VCPU ID 1260 + * msr_index - Index of MSR 1261 + * msr_value - New value of MSR 1262 + * 1263 + * Output Args: None 1264 + * 1265 + * Return: On success, nothing. On failure a TEST_ASSERT is produced. 1266 + * 1267 + * Set value of MSR for VCPU. 1268 + */ 1269 + void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, 1270 + uint64_t msr_value) 1271 + { 1272 + struct vcpu *vcpu = vcpu_find(vm, vcpuid); 1273 + struct { 1274 + struct kvm_msrs header; 1275 + struct kvm_msr_entry entry; 1276 + } buffer = {}; 1277 + int r; 1278 + 1279 + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); 1280 + memset(&buffer, 0, sizeof(buffer)); 1281 + buffer.header.nmsrs = 1; 1282 + buffer.entry.index = msr_index; 1283 + buffer.entry.data = msr_value; 1284 + r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); 1285 + TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" 1286 + " rc: %i errno: %i", r, errno); 1244 1287 } 1245 1288 1246 1289 /* VM VCPU Args Set
+110
tools/testing/selftests/kvm/platform_info_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Test for x86 KVM_CAP_MSR_PLATFORM_INFO 4 + * 5 + * Copyright (C) 2018, Google LLC. 6 + * 7 + * This work is licensed under the terms of the GNU GPL, version 2. 8 + * 9 + * Verifies expected behavior of controlling guest access to 10 + * MSR_PLATFORM_INFO. 11 + */ 12 + 13 + #define _GNU_SOURCE /* for program_invocation_short_name */ 14 + #include <fcntl.h> 15 + #include <stdio.h> 16 + #include <stdlib.h> 17 + #include <string.h> 18 + #include <sys/ioctl.h> 19 + 20 + #include "test_util.h" 21 + #include "kvm_util.h" 22 + #include "x86.h" 23 + 24 + #define VCPU_ID 0 25 + #define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 26 + 27 + static void guest_code(void) 28 + { 29 + uint64_t msr_platform_info; 30 + 31 + for (;;) { 32 + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); 33 + GUEST_SYNC(msr_platform_info); 34 + asm volatile ("inc %r11"); 35 + } 36 + } 37 + 38 + static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) 39 + { 40 + struct kvm_enable_cap cap = {}; 41 + 42 + cap.cap = KVM_CAP_MSR_PLATFORM_INFO; 43 + cap.flags = 0; 44 + cap.args[0] = (int)enable; 45 + vm_enable_cap(vm, &cap); 46 + } 47 + 48 + static void test_msr_platform_info_enabled(struct kvm_vm *vm) 49 + { 50 + struct kvm_run *run = vcpu_state(vm, VCPU_ID); 51 + struct guest_args args; 52 + 53 + set_msr_platform_info_enabled(vm, true); 54 + vcpu_run(vm, VCPU_ID); 55 + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, 56 + "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", 57 + run->exit_reason, 58 + exit_reason_str(run->exit_reason)); 59 + guest_args_read(vm, VCPU_ID, &args); 60 + TEST_ASSERT(args.port == GUEST_PORT_SYNC, 61 + "Received IO from port other than PORT_HOST_SYNC: %u\n", 62 + run->io.port); 63 + TEST_ASSERT((args.arg1 & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == 64 + MSR_PLATFORM_INFO_MAX_TURBO_RATIO, 65 + "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", 66 + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); 67 + } 68 + 69 + static 
void test_msr_platform_info_disabled(struct kvm_vm *vm) 70 + { 71 + struct kvm_run *run = vcpu_state(vm, VCPU_ID); 72 + 73 + set_msr_platform_info_enabled(vm, false); 74 + vcpu_run(vm, VCPU_ID); 75 + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, 76 + "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", 77 + run->exit_reason, 78 + exit_reason_str(run->exit_reason)); 79 + } 80 + 81 + int main(int argc, char *argv[]) 82 + { 83 + struct kvm_vm *vm; 84 + struct kvm_run *state; 85 + int rv; 86 + uint64_t msr_platform_info; 87 + 88 + /* Tell stdout not to buffer its content */ 89 + setbuf(stdout, NULL); 90 + 91 + rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); 92 + if (!rv) { 93 + fprintf(stderr, 94 + "KVM_CAP_MSR_PLATFORM_INFO not supported, skip test\n"); 95 + exit(KSFT_SKIP); 96 + } 97 + 98 + vm = vm_create_default(VCPU_ID, 0, guest_code); 99 + 100 + msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); 101 + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, 102 + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); 103 + test_msr_platform_info_disabled(vm); 104 + test_msr_platform_info_enabled(vm); 105 + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); 106 + 107 + kvm_vm_free(vm); 108 + 109 + return 0; 110 + }