Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
"This is a pretty large update. I think it is roughly as big as what I
usually had for the _whole_ rc period.

There are a few bad bugs where the guest can OOPS or crash the host.
We have also started looking at attack models for nested
virtualization; bugs that usually result in the guest ring 0 crashing
itself become more worrisome if you have nested virtualization,
because the nested guest might bring down the non-nested guest as
well. For current uses of nested virtualization these do not really
have a security impact, but you never know and bugs are bugs
nevertheless.

A lot of these bugs are in 3.17 too, resulting in a large number of
stable@ Ccs. I checked that all the patches apply there with no
conflicts."

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
kvm: vfio: fix unregister kvm_device_ops of vfio
KVM: x86: Wrong assertion on paging_tmpl.h
kvm: fix excessive pages un-pinning in kvm_iommu_map error path.
KVM: x86: PREFETCH and HINT_NOP should have SrcMem flag
KVM: x86: Emulator does not decode clflush well
KVM: emulate: avoid accessing NULL ctxt->memopp
KVM: x86: Decoding guest instructions which cross page boundary may fail
kvm: x86: don't kill guest on unknown exit reason
kvm: vmx: handle invvpid vm exit gracefully
KVM: x86: Handle errors when RIP is set during far jumps
KVM: x86: Emulator fixes for eip canonical checks on near branches
KVM: x86: Fix wrong masking on relative jump/call
KVM: x86: Improve thread safety in pit
KVM: x86: Prevent host from panicking on shared MSR writes.
KVM: x86: Check non-canonical addresses upon WRMSR

+283 -86
+15 -1
arch/x86/include/asm/kvm_host.h
··· 989 989 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 990 990 } 991 991 992 + static inline u64 get_canonical(u64 la) 993 + { 994 + return ((int64_t)la << 16) >> 16; 995 + } 996 + 997 + static inline bool is_noncanonical_address(u64 la) 998 + { 999 + #ifdef CONFIG_X86_64 1000 + return get_canonical(la) != la; 1001 + #else 1002 + return false; 1003 + #endif 1004 + } 1005 + 992 1006 #define TSS_IOPB_BASE_OFFSET 0x66 993 1007 #define TSS_BASE_SIZE 0x68 994 1008 #define TSS_IOPB_SIZE (65536 / 8) ··· 1064 1050 unsigned long address); 1065 1051 1066 1052 void kvm_define_shared_msr(unsigned index, u32 msr); 1067 - void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 1053 + int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); 1068 1054 1069 1055 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 1070 1056
+2
arch/x86/include/uapi/asm/vmx.h
··· 67 67 #define EXIT_REASON_EPT_MISCONFIG 49 68 68 #define EXIT_REASON_INVEPT 50 69 69 #define EXIT_REASON_PREEMPTION_TIMER 52 70 + #define EXIT_REASON_INVVPID 53 70 71 #define EXIT_REASON_WBINVD 54 71 72 #define EXIT_REASON_XSETBV 55 72 73 #define EXIT_REASON_APIC_WRITE 56 ··· 115 114 { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ 116 115 { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ 117 116 { EXIT_REASON_INVD, "INVD" }, \ 117 + { EXIT_REASON_INVVPID, "INVVPID" }, \ 118 118 { EXIT_REASON_INVPCID, "INVPCID" } 119 119 120 120 #endif /* _UAPIVMX_H */
+187 -65
arch/x86/kvm/emulate.c
··· 504 504 masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); 505 505 } 506 506 507 - static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) 508 - { 509 - register_address_increment(ctxt, &ctxt->_eip, rel); 510 - } 511 - 512 507 static u32 desc_limit_scaled(struct desc_struct *desc) 513 508 { 514 509 u32 limit = get_desc_limit(desc); ··· 562 567 static int emulate_nm(struct x86_emulate_ctxt *ctxt) 563 568 { 564 569 return emulate_exception(ctxt, NM_VECTOR, 0, false); 570 + } 571 + 572 + static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, 573 + int cs_l) 574 + { 575 + switch (ctxt->op_bytes) { 576 + case 2: 577 + ctxt->_eip = (u16)dst; 578 + break; 579 + case 4: 580 + ctxt->_eip = (u32)dst; 581 + break; 582 + case 8: 583 + if ((cs_l && is_noncanonical_address(dst)) || 584 + (!cs_l && (dst & ~(u32)-1))) 585 + return emulate_gp(ctxt, 0); 586 + ctxt->_eip = dst; 587 + break; 588 + default: 589 + WARN(1, "unsupported eip assignment size\n"); 590 + } 591 + return X86EMUL_CONTINUE; 592 + } 593 + 594 + static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) 595 + { 596 + return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); 597 + } 598 + 599 + static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) 600 + { 601 + return assign_eip_near(ctxt, ctxt->_eip + rel); 565 602 } 566 603 567 604 static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) ··· 778 751 static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, 779 752 unsigned size) 780 753 { 781 - if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) 782 - return __do_insn_fetch_bytes(ctxt, size); 754 + unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr; 755 + 756 + if (unlikely(done_size < size)) 757 + return __do_insn_fetch_bytes(ctxt, size - done_size); 783 758 else 784 759 return X86EMUL_CONTINUE; 785 760 } ··· 1445 1416 1446 1417 /* Does not support long mode */ 1447 1418 
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1448 - u16 selector, int seg, u8 cpl, bool in_task_switch) 1419 + u16 selector, int seg, u8 cpl, 1420 + bool in_task_switch, 1421 + struct desc_struct *desc) 1449 1422 { 1450 1423 struct desc_struct seg_desc, old_desc; 1451 1424 u8 dpl, rpl; ··· 1588 1557 } 1589 1558 load: 1590 1559 ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); 1560 + if (desc) 1561 + *desc = seg_desc; 1591 1562 return X86EMUL_CONTINUE; 1592 1563 exception: 1593 1564 return emulate_exception(ctxt, err_vec, err_code, true); ··· 1599 1566 u16 selector, int seg) 1600 1567 { 1601 1568 u8 cpl = ctxt->ops->cpl(ctxt); 1602 - return __load_segment_descriptor(ctxt, selector, seg, cpl, false); 1569 + return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); 1603 1570 } 1604 1571 1605 1572 static void write_register_operand(struct operand *op) ··· 1993 1960 static int em_jmp_far(struct x86_emulate_ctxt *ctxt) 1994 1961 { 1995 1962 int rc; 1996 - unsigned short sel; 1963 + unsigned short sel, old_sel; 1964 + struct desc_struct old_desc, new_desc; 1965 + const struct x86_emulate_ops *ops = ctxt->ops; 1966 + u8 cpl = ctxt->ops->cpl(ctxt); 1967 + 1968 + /* Assignment of RIP may only fail in 64-bit mode */ 1969 + if (ctxt->mode == X86EMUL_MODE_PROT64) 1970 + ops->get_segment(ctxt, &old_sel, &old_desc, NULL, 1971 + VCPU_SREG_CS); 1997 1972 1998 1973 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); 1999 1974 2000 - rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); 1975 + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, 1976 + &new_desc); 2001 1977 if (rc != X86EMUL_CONTINUE) 2002 1978 return rc; 2003 1979 2004 - ctxt->_eip = 0; 2005 - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); 2006 - return X86EMUL_CONTINUE; 1980 + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); 1981 + if (rc != X86EMUL_CONTINUE) { 1982 + WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); 1983 + /* assigning eip 
failed; restore the old cs */ 1984 + ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS); 1985 + return rc; 1986 + } 1987 + return rc; 2007 1988 } 2008 1989 2009 1990 static int em_grp45(struct x86_emulate_ctxt *ctxt) ··· 2028 1981 case 2: /* call near abs */ { 2029 1982 long int old_eip; 2030 1983 old_eip = ctxt->_eip; 2031 - ctxt->_eip = ctxt->src.val; 1984 + rc = assign_eip_near(ctxt, ctxt->src.val); 1985 + if (rc != X86EMUL_CONTINUE) 1986 + break; 2032 1987 ctxt->src.val = old_eip; 2033 1988 rc = em_push(ctxt); 2034 1989 break; 2035 1990 } 2036 1991 case 4: /* jmp abs */ 2037 - ctxt->_eip = ctxt->src.val; 1992 + rc = assign_eip_near(ctxt, ctxt->src.val); 2038 1993 break; 2039 1994 case 5: /* jmp far */ 2040 1995 rc = em_jmp_far(ctxt); ··· 2071 2022 2072 2023 static int em_ret(struct x86_emulate_ctxt *ctxt) 2073 2024 { 2074 - ctxt->dst.type = OP_REG; 2075 - ctxt->dst.addr.reg = &ctxt->_eip; 2076 - ctxt->dst.bytes = ctxt->op_bytes; 2077 - return em_pop(ctxt); 2025 + int rc; 2026 + unsigned long eip; 2027 + 2028 + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); 2029 + if (rc != X86EMUL_CONTINUE) 2030 + return rc; 2031 + 2032 + return assign_eip_near(ctxt, eip); 2078 2033 } 2079 2034 2080 2035 static int em_ret_far(struct x86_emulate_ctxt *ctxt) 2081 2036 { 2082 2037 int rc; 2083 - unsigned long cs; 2038 + unsigned long eip, cs; 2039 + u16 old_cs; 2084 2040 int cpl = ctxt->ops->cpl(ctxt); 2041 + struct desc_struct old_desc, new_desc; 2042 + const struct x86_emulate_ops *ops = ctxt->ops; 2085 2043 2086 - rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); 2044 + if (ctxt->mode == X86EMUL_MODE_PROT64) 2045 + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, 2046 + VCPU_SREG_CS); 2047 + 2048 + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); 2087 2049 if (rc != X86EMUL_CONTINUE) 2088 2050 return rc; 2089 - if (ctxt->op_bytes == 4) 2090 - ctxt->_eip = (u32)ctxt->_eip; 2091 2051 rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); 2092 2052 if (rc != X86EMUL_CONTINUE) 2093 
2053 return rc; 2094 2054 /* Outer-privilege level return is not implemented */ 2095 2055 if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) 2096 2056 return X86EMUL_UNHANDLEABLE; 2097 - rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); 2057 + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false, 2058 + &new_desc); 2059 + if (rc != X86EMUL_CONTINUE) 2060 + return rc; 2061 + rc = assign_eip_far(ctxt, eip, new_desc.l); 2062 + if (rc != X86EMUL_CONTINUE) { 2063 + WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); 2064 + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); 2065 + } 2098 2066 return rc; 2099 2067 } 2100 2068 ··· 2372 2306 { 2373 2307 const struct x86_emulate_ops *ops = ctxt->ops; 2374 2308 struct desc_struct cs, ss; 2375 - u64 msr_data; 2309 + u64 msr_data, rcx, rdx; 2376 2310 int usermode; 2377 2311 u16 cs_sel = 0, ss_sel = 0; 2378 2312 ··· 2387 2321 usermode = X86EMUL_MODE_PROT64; 2388 2322 else 2389 2323 usermode = X86EMUL_MODE_PROT32; 2324 + 2325 + rcx = reg_read(ctxt, VCPU_REGS_RCX); 2326 + rdx = reg_read(ctxt, VCPU_REGS_RDX); 2390 2327 2391 2328 cs.dpl = 3; 2392 2329 ss.dpl = 3; ··· 2408 2339 ss_sel = cs_sel + 8; 2409 2340 cs.d = 0; 2410 2341 cs.l = 1; 2342 + if (is_noncanonical_address(rcx) || 2343 + is_noncanonical_address(rdx)) 2344 + return emulate_gp(ctxt, 0); 2411 2345 break; 2412 2346 } 2413 2347 cs_sel |= SELECTOR_RPL_MASK; ··· 2419 2347 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); 2420 2348 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); 2421 2349 2422 - ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); 2423 - *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); 2350 + ctxt->_eip = rdx; 2351 + *reg_write(ctxt, VCPU_REGS_RSP) = rcx; 2424 2352 2425 2353 return X86EMUL_CONTINUE; 2426 2354 } ··· 2538 2466 * Now load segment descriptors. 
If fault happens at this stage 2539 2467 * it is handled in a context of new task 2540 2468 */ 2541 - ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); 2469 + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, 2470 + true, NULL); 2542 2471 if (ret != X86EMUL_CONTINUE) 2543 2472 return ret; 2544 - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); 2473 + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, 2474 + true, NULL); 2545 2475 if (ret != X86EMUL_CONTINUE) 2546 2476 return ret; 2547 - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); 2477 + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, 2478 + true, NULL); 2548 2479 if (ret != X86EMUL_CONTINUE) 2549 2480 return ret; 2550 - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); 2481 + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, 2482 + true, NULL); 2551 2483 if (ret != X86EMUL_CONTINUE) 2552 2484 return ret; 2553 - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); 2485 + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, 2486 + true, NULL); 2554 2487 if (ret != X86EMUL_CONTINUE) 2555 2488 return ret; 2556 2489 ··· 2680 2603 * Now load segment descriptors. 
If fault happenes at this stage 2681 2604 * it is handled in a context of new task 2682 2605 */ 2683 - ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); 2606 + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, 2607 + cpl, true, NULL); 2684 2608 if (ret != X86EMUL_CONTINUE) 2685 2609 return ret; 2686 - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); 2610 + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, 2611 + true, NULL); 2687 2612 if (ret != X86EMUL_CONTINUE) 2688 2613 return ret; 2689 - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); 2614 + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, 2615 + true, NULL); 2690 2616 if (ret != X86EMUL_CONTINUE) 2691 2617 return ret; 2692 - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); 2618 + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, 2619 + true, NULL); 2693 2620 if (ret != X86EMUL_CONTINUE) 2694 2621 return ret; 2695 - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); 2622 + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, 2623 + true, NULL); 2696 2624 if (ret != X86EMUL_CONTINUE) 2697 2625 return ret; 2698 - ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); 2626 + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, 2627 + true, NULL); 2699 2628 if (ret != X86EMUL_CONTINUE) 2700 2629 return ret; 2701 - ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); 2630 + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, 2631 + true, NULL); 2702 2632 if (ret != X86EMUL_CONTINUE) 2703 2633 return ret; 2704 2634 ··· 2972 2888 2973 2889 static int em_call(struct x86_emulate_ctxt *ctxt) 2974 2890 { 2891 + int rc; 2975 2892 long rel = ctxt->src.val; 2976 2893 2977 2894 ctxt->src.val = (unsigned long)ctxt->_eip; 2978 - jmp_rel(ctxt, rel); 2895 
+ rc = jmp_rel(ctxt, rel); 2896 + if (rc != X86EMUL_CONTINUE) 2897 + return rc; 2979 2898 return em_push(ctxt); 2980 2899 } 2981 2900 ··· 2987 2900 u16 sel, old_cs; 2988 2901 ulong old_eip; 2989 2902 int rc; 2903 + struct desc_struct old_desc, new_desc; 2904 + const struct x86_emulate_ops *ops = ctxt->ops; 2905 + int cpl = ctxt->ops->cpl(ctxt); 2990 2906 2991 - old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); 2992 2907 old_eip = ctxt->_eip; 2908 + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); 2993 2909 2994 2910 memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); 2995 - if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) 2911 + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, 2912 + &new_desc); 2913 + if (rc != X86EMUL_CONTINUE) 2996 2914 return X86EMUL_CONTINUE; 2997 2915 2998 - ctxt->_eip = 0; 2999 - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); 2916 + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); 2917 + if (rc != X86EMUL_CONTINUE) 2918 + goto fail; 3000 2919 3001 2920 ctxt->src.val = old_cs; 3002 2921 rc = em_push(ctxt); 3003 2922 if (rc != X86EMUL_CONTINUE) 3004 - return rc; 2923 + goto fail; 3005 2924 3006 2925 ctxt->src.val = old_eip; 3007 - return em_push(ctxt); 2926 + rc = em_push(ctxt); 2927 + /* If we failed, we tainted the memory, but the very least we should 2928 + restore cs */ 2929 + if (rc != X86EMUL_CONTINUE) 2930 + goto fail; 2931 + return rc; 2932 + fail: 2933 + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); 2934 + return rc; 2935 + 3008 2936 } 3009 2937 3010 2938 static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) 3011 2939 { 3012 2940 int rc; 2941 + unsigned long eip; 3013 2942 3014 - ctxt->dst.type = OP_REG; 3015 - ctxt->dst.addr.reg = &ctxt->_eip; 3016 - ctxt->dst.bytes = ctxt->op_bytes; 3017 - rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); 2943 + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); 2944 + if (rc != X86EMUL_CONTINUE) 2945 + return rc; 2946 + rc = 
assign_eip_near(ctxt, eip); 3018 2947 if (rc != X86EMUL_CONTINUE) 3019 2948 return rc; 3020 2949 rsp_increment(ctxt, ctxt->src.val); ··· 3357 3254 3358 3255 static int em_loop(struct x86_emulate_ctxt *ctxt) 3359 3256 { 3257 + int rc = X86EMUL_CONTINUE; 3258 + 3360 3259 register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); 3361 3260 if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && 3362 3261 (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) 3363 - jmp_rel(ctxt, ctxt->src.val); 3262 + rc = jmp_rel(ctxt, ctxt->src.val); 3364 3263 3365 - return X86EMUL_CONTINUE; 3264 + return rc; 3366 3265 } 3367 3266 3368 3267 static int em_jcxz(struct x86_emulate_ctxt *ctxt) 3369 3268 { 3370 - if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) 3371 - jmp_rel(ctxt, ctxt->src.val); 3269 + int rc = X86EMUL_CONTINUE; 3372 3270 3373 - return X86EMUL_CONTINUE; 3271 + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) 3272 + rc = jmp_rel(ctxt, ctxt->src.val); 3273 + 3274 + return rc; 3374 3275 } 3375 3276 3376 3277 static int em_in(struct x86_emulate_ctxt *ctxt) ··· 3459 3352 asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); 3460 3353 break; 3461 3354 } 3355 + return X86EMUL_CONTINUE; 3356 + } 3357 + 3358 + static int em_clflush(struct x86_emulate_ctxt *ctxt) 3359 + { 3360 + /* emulating clflush regardless of cpuid */ 3462 3361 return X86EMUL_CONTINUE; 3463 3362 } 3464 3363 ··· 3806 3693 X7(D(Undefined)), 3807 3694 }; 3808 3695 3696 + static const struct gprefix pfx_0f_ae_7 = { 3697 + I(SrcMem | ByteOp, em_clflush), N, N, N, 3698 + }; 3699 + 3700 + static const struct group_dual group15 = { { 3701 + N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), 3702 + }, { 3703 + N, N, N, N, N, N, N, N, 3704 + } }; 3705 + 3809 3706 static const struct gprefix pfx_0f_6f_0f_7f = { 3810 3707 I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), 3811 3708 }; ··· 4024 3901 N, I(ImplicitOps | EmulateOnUD, em_syscall), 4025 3902 II(ImplicitOps | 
Priv, em_clts, clts), N, 4026 3903 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 4027 - N, D(ImplicitOps | ModRM), N, N, 3904 + N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, 4028 3905 /* 0x10 - 0x1F */ 4029 3906 N, N, N, N, N, N, N, N, 4030 - D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), 3907 + D(ImplicitOps | ModRM | SrcMem | NoAccess), 3908 + N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), 4031 3909 /* 0x20 - 0x2F */ 4032 3910 DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), 4033 3911 DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), ··· 4080 3956 F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), 4081 3957 F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), 4082 3958 F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), 4083 - D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), 3959 + GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), 4084 3960 /* 0xB0 - 0xB7 */ 4085 3961 I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), 4086 3962 I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), ··· 4597 4473 /* Decode and fetch the destination operand: register or memory. */ 4598 4474 rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); 4599 4475 4600 - done: 4601 4476 if (ctxt->rip_relative) 4602 4477 ctxt->memopp->addr.mem.ea += ctxt->_eip; 4603 4478 4479 + done: 4604 4480 return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; 4605 4481 } 4606 4482 ··· 4850 4726 break; 4851 4727 case 0x70 ... 
0x7f: /* jcc (short) */ 4852 4728 if (test_cc(ctxt->b, ctxt->eflags)) 4853 - jmp_rel(ctxt, ctxt->src.val); 4729 + rc = jmp_rel(ctxt, ctxt->src.val); 4854 4730 break; 4855 4731 case 0x8d: /* lea r16/r32, m */ 4856 4732 ctxt->dst.val = ctxt->src.addr.mem.ea; ··· 4880 4756 break; 4881 4757 case 0xe9: /* jmp rel */ 4882 4758 case 0xeb: /* jmp rel short */ 4883 - jmp_rel(ctxt, ctxt->src.val); 4759 + rc = jmp_rel(ctxt, ctxt->src.val); 4884 4760 ctxt->dst.type = OP_NONE; /* Disable writeback. */ 4885 4761 break; 4886 4762 case 0xf4: /* hlt */ ··· 5005 4881 break; 5006 4882 case 0x80 ... 0x8f: /* jnz rel, etc*/ 5007 4883 if (test_cc(ctxt->b, ctxt->eflags)) 5008 - jmp_rel(ctxt, ctxt->src.val); 4884 + rc = jmp_rel(ctxt, ctxt->src.val); 5009 4885 break; 5010 4886 case 0x90 ... 0x9f: /* setcc r/m8 */ 5011 4887 ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); 5012 - break; 5013 - case 0xae: /* clflush */ 5014 4888 break; 5015 4889 case 0xb6 ... 0xb7: /* movzx */ 5016 4890 ctxt->dst.bytes = ctxt->op_bytes;
+2
arch/x86/kvm/i8254.c
··· 262 262 return; 263 263 264 264 timer = &pit->pit_state.timer; 265 + mutex_lock(&pit->pit_state.lock); 265 266 if (hrtimer_cancel(timer)) 266 267 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 268 + mutex_unlock(&pit->pit_state.lock); 267 269 } 268 270 269 271 static void destroy_pit_timer(struct kvm_pit *pit)
+1 -1
arch/x86/kvm/paging_tmpl.h
··· 298 298 } 299 299 #endif 300 300 walker->max_level = walker->level; 301 - ASSERT(!is_long_mode(vcpu) && is_pae(vcpu)); 301 + ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); 302 302 303 303 accessed_dirty = PT_GUEST_ACCESSED_MASK; 304 304 pt_access = pte_access = ACC_ALL;
+4 -4
arch/x86/kvm/svm.c
··· 3251 3251 msr.host_initiated = false; 3252 3252 3253 3253 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3254 - if (svm_set_msr(&svm->vcpu, &msr)) { 3254 + if (kvm_set_msr(&svm->vcpu, &msr)) { 3255 3255 trace_kvm_msr_write_ex(ecx, data); 3256 3256 kvm_inject_gp(&svm->vcpu, 0); 3257 3257 } else { ··· 3551 3551 3552 3552 if (exit_code >= ARRAY_SIZE(svm_exit_handlers) 3553 3553 || !svm_exit_handlers[exit_code]) { 3554 - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3555 - kvm_run->hw.hardware_exit_reason = exit_code; 3556 - return 0; 3554 + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); 3555 + kvm_queue_exception(vcpu, UD_VECTOR); 3556 + return 1; 3557 3557 } 3558 3558 3559 3559 return svm_exit_handlers[exit_code](svm);
+17 -7
arch/x86/kvm/vmx.c
··· 2659 2659 default: 2660 2660 msr = find_msr_entry(vmx, msr_index); 2661 2661 if (msr) { 2662 + u64 old_msr_data = msr->data; 2662 2663 msr->data = data; 2663 2664 if (msr - vmx->guest_msrs < vmx->save_nmsrs) { 2664 2665 preempt_disable(); 2665 - kvm_set_shared_msr(msr->index, msr->data, 2666 - msr->mask); 2666 + ret = kvm_set_shared_msr(msr->index, msr->data, 2667 + msr->mask); 2667 2668 preempt_enable(); 2669 + if (ret) 2670 + msr->data = old_msr_data; 2668 2671 } 2669 2672 break; 2670 2673 } ··· 5294 5291 msr.data = data; 5295 5292 msr.index = ecx; 5296 5293 msr.host_initiated = false; 5297 - if (vmx_set_msr(vcpu, &msr) != 0) { 5294 + if (kvm_set_msr(vcpu, &msr) != 0) { 5298 5295 trace_kvm_msr_write_ex(ecx, data); 5299 5296 kvm_inject_gp(vcpu, 0); 5300 5297 return 1; ··· 6746 6743 return 1; 6747 6744 } 6748 6745 6746 + static int handle_invvpid(struct kvm_vcpu *vcpu) 6747 + { 6748 + kvm_queue_exception(vcpu, UD_VECTOR); 6749 + return 1; 6750 + } 6751 + 6749 6752 /* 6750 6753 * The exit handlers return 1 if the exit was handled fully and guest execution 6751 6754 * may resume. Otherwise they set the kvm_run parameter to indicate what needs ··· 6797 6788 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 6798 6789 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 6799 6790 [EXIT_REASON_INVEPT] = handle_invept, 6791 + [EXIT_REASON_INVVPID] = handle_invvpid, 6800 6792 }; 6801 6793 6802 6794 static const int kvm_vmx_max_exit_handlers = ··· 7033 7023 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 7034 7024 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 7035 7025 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 7036 - case EXIT_REASON_INVEPT: 7026 + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 7037 7027 /* 7038 7028 * VMX instructions trap unconditionally. This allows L1 to 7039 7029 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
··· 7174 7164 && kvm_vmx_exit_handlers[exit_reason]) 7175 7165 return kvm_vmx_exit_handlers[exit_reason](vcpu); 7176 7166 else { 7177 - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 7178 - vcpu->run->hw.hardware_exit_reason = exit_reason; 7167 + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); 7168 + kvm_queue_exception(vcpu, UD_VECTOR); 7169 + return 1; 7179 7170 } 7180 - return 0; 7181 7171 } 7182 7172 7183 7173 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+34 -4
arch/x86/kvm/x86.c
··· 229 229 shared_msr_update(i, shared_msrs_global.msrs[i]); 230 230 } 231 231 232 - void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 232 + int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) 233 233 { 234 234 unsigned int cpu = smp_processor_id(); 235 235 struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); 236 + int err; 236 237 237 238 if (((value ^ smsr->values[slot].curr) & mask) == 0) 238 - return; 239 + return 0; 239 240 smsr->values[slot].curr = value; 240 - wrmsrl(shared_msrs_global.msrs[slot], value); 241 + err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); 242 + if (err) 243 + return 1; 244 + 241 245 if (!smsr->registered) { 242 246 smsr->urn.on_user_return = kvm_on_user_return; 243 247 user_return_notifier_register(&smsr->urn); 244 248 smsr->registered = true; 245 249 } 250 + return 0; 246 251 } 247 252 EXPORT_SYMBOL_GPL(kvm_set_shared_msr); 248 253 ··· 992 987 } 993 988 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); 994 989 995 - 996 990 /* 997 991 * Writes msr value into into the appropriate "register". 998 992 * Returns 0 on success, non-0 otherwise. ··· 999 995 */ 1000 996 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 1001 997 { 998 + switch (msr->index) { 999 + case MSR_FS_BASE: 1000 + case MSR_GS_BASE: 1001 + case MSR_KERNEL_GS_BASE: 1002 + case MSR_CSTAR: 1003 + case MSR_LSTAR: 1004 + if (is_noncanonical_address(msr->data)) 1005 + return 1; 1006 + break; 1007 + case MSR_IA32_SYSENTER_EIP: 1008 + case MSR_IA32_SYSENTER_ESP: 1009 + /* 1010 + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if 1011 + * non-canonical address is written on Intel but not on 1012 + * AMD (which ignores the top 32-bits, because it does 1013 + * not implement 64-bit SYSENTER). 1014 + * 1015 + * 64-bit code should hence be able to write a non-canonical 1016 + * value on AMD. 
Making the address canonical ensures that 1017 + * vmentry does not fail on Intel after writing a non-canonical 1018 + * value, and that something deterministic happens if the guest 1019 + * invokes 64-bit SYSENTER. 1020 + */ 1021 + msr->data = get_canonical(msr->data); 1022 + } 1002 1023 return kvm_x86_ops->set_msr(vcpu, msr); 1003 1024 } 1025 + EXPORT_SYMBOL_GPL(kvm_set_msr); 1004 1026 1005 1027 /* 1006 1028 * Adapt set_msr() to msr_io()'s calling convention
+1
include/linux/kvm_host.h
··· 1080 1080 void kvm_device_put(struct kvm_device *dev); 1081 1081 struct kvm_device *kvm_device_from_filp(struct file *filp); 1082 1082 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); 1083 + void kvm_unregister_device_ops(u32 type); 1083 1084 1084 1085 extern struct kvm_device_ops kvm_mpic_ops; 1085 1086 extern struct kvm_device_ops kvm_xics_ops;
+4 -4
virt/kvm/iommu.c
··· 43 43 gfn_t base_gfn, unsigned long npages); 44 44 45 45 static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, 46 - unsigned long size) 46 + unsigned long npages) 47 47 { 48 48 gfn_t end_gfn; 49 49 pfn_t pfn; 50 50 51 51 pfn = gfn_to_pfn_memslot(slot, gfn); 52 - end_gfn = gfn + (size >> PAGE_SHIFT); 52 + end_gfn = gfn + npages; 53 53 gfn += 1; 54 54 55 55 if (is_error_noslot_pfn(pfn)) ··· 119 119 * Pin all pages we are about to map in memory. This is 120 120 * important because we unmap and unpin in 4kb steps later. 121 121 */ 122 - pfn = kvm_pin_pages(slot, gfn, page_size); 122 + pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); 123 123 if (is_error_noslot_pfn(pfn)) { 124 124 gfn += 1; 125 125 continue; ··· 131 131 if (r) { 132 132 printk(KERN_ERR "kvm_iommu_map_address:" 133 133 "iommu failed to map pfn=%llx\n", pfn); 134 - kvm_unpin_pages(kvm, pfn, page_size); 134 + kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); 135 135 goto unmap_pages; 136 136 } 137 137
+7
virt/kvm/kvm_main.c
··· 2354 2354 return 0; 2355 2355 } 2356 2356 2357 + void kvm_unregister_device_ops(u32 type) 2358 + { 2359 + if (kvm_device_ops_table[type] != NULL) 2360 + kvm_device_ops_table[type] = NULL; 2361 + } 2362 + 2357 2363 static int kvm_ioctl_create_device(struct kvm *kvm, 2358 2364 struct kvm_create_device *cd) 2359 2365 { ··· 3334 3328 kvm_arch_exit(); 3335 3329 kvm_irqfd_exit(); 3336 3330 free_cpumask_var(cpus_hardware_enabled); 3331 + kvm_vfio_ops_exit(); 3337 3332 } 3338 3333 EXPORT_SYMBOL_GPL(kvm_exit);
+5
virt/kvm/vfio.c
··· 283 283 { 284 284 return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); 285 285 } 286 + 287 + void kvm_vfio_ops_exit(void) 288 + { 289 + kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); 290 + }
+4
virt/kvm/vfio.h
··· 3 3 4 4 #ifdef CONFIG_KVM_VFIO 5 5 int kvm_vfio_ops_init(void); 6 + void kvm_vfio_ops_exit(void); 6 7 #else 7 8 static inline int kvm_vfio_ops_init(void) 8 9 { 9 10 return 0; 11 + } 12 + static inline void kvm_vfio_ops_exit(void) 13 + { 10 14 } 11 15 #endif 12 16