Merge tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+20 -9

arch/x86/entry/common.c

··· 60 60 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 61 61 static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) 62 62 { 63 - unsigned int nr = (unsigned int)regs->orig_ax; 64 - 65 63 if (IS_ENABLED(CONFIG_IA32_EMULATION)) 66 64 current_thread_info()->status |= TS_COMPAT; 67 - /* 68 - * Subtlety here: if ptrace pokes something larger than 2^32-1 into 69 - * orig_ax, the unsigned int return value truncates it. This may 70 - * or may not be necessary, but it matches the old asm behavior. 71 - */ 72 - return (unsigned int)syscall_enter_from_user_mode(regs, nr); 65 + 66 + return (unsigned int)regs->orig_ax; 73 67 } 74 68 75 69 /* ··· 85 91 { 86 92 unsigned int nr = syscall_32_enter(regs); 87 93 94 + /* 95 + * Subtlety here: if ptrace pokes something larger than 2^32-1 into 96 + * orig_ax, the unsigned int return value truncates it. This may 97 + * or may not be necessary, but it matches the old asm behavior. 98 + */ 99 + nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); 100 + 88 101 do_syscall_32_irqs_on(regs, nr); 89 102 syscall_exit_to_user_mode(regs); 90 103 } 91 104 92 105 static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) 93 106 { 94 - unsigned int nr = syscall_32_enter(regs); 107 + unsigned int nr = syscall_32_enter(regs); 95 108 int res; 109 + 110 + /* 111 + * This cannot use syscall_enter_from_user_mode() as it has to 112 + * fetch EBP before invoking any of the syscall entry work 113 + * functions. 114 + */ 115 + syscall_enter_from_user_mode_prepare(regs); 96 116 97 117 instrumentation_begin(); 98 118 /* Fetch EBP from where the vDSO stashed it. */ ··· 129 121 syscall_exit_to_user_mode(regs); 130 122 return false; 131 123 } 124 + 125 + /* The cast truncates any ptrace induced syscall nr > 2^32-1 */ 126 + nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); 132 127 133 128 /* Now this is just like a normal syscall. */ 134 129 do_syscall_32_irqs_on(regs, nr);

+10 -2

arch/x86/include/asm/entry-common.h

··· 18 18 * state, not the interrupt state as imagined by Xen. 19 19 */ 20 20 unsigned long flags = native_save_fl(); 21 - WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | 22 - X86_EFLAGS_NT)); 21 + unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT; 22 + 23 + /* 24 + * For !SMAP hardware we patch out CLAC on entry. 25 + */ 26 + if (boot_cpu_has(X86_FEATURE_SMAP) || 27 + (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV))) 28 + mask |= X86_EFLAGS_AC; 29 + 30 + WARN_ON_ONCE(flags & mask); 23 31 24 32 /* We think we came from user mode. Make sure pt_regs agrees. */ 25 33 WARN_ON_ONCE(!user_mode(regs));

+1 -1

arch/x86/include/asm/ptrace.h

··· 327 327 static const unsigned int argument_offs[] = { 328 328 #ifdef __i386__ 329 329 offsetof(struct pt_regs, ax), 330 - offsetof(struct pt_regs, cx), 331 330 offsetof(struct pt_regs, dx), 331 + offsetof(struct pt_regs, cx), 332 332 #define NR_REG_ARGUMENTS 3 333 333 #else 334 334 offsetof(struct pt_regs, di),

+31 -34

arch/x86/kernel/traps.c

··· 729 729 #endif 730 730 } 731 731 732 - static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) 732 + static __always_inline unsigned long debug_read_clear_dr6(void) 733 733 { 734 - /* 735 - * Disable breakpoints during exception handling; recursive exceptions 736 - * are exceedingly 'fun'. 737 - * 738 - * Since this function is NOKPROBE, and that also applies to 739 - * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a 740 - * HW_BREAKPOINT_W on our stack) 741 - * 742 - * Entry text is excluded for HW_BP_X and cpu_entry_area, which 743 - * includes the entry stack is excluded for everything. 744 - */ 745 - *dr7 = local_db_save(); 734 + unsigned long dr6; 746 735 747 736 /* 748 737 * The Intel SDM says: ··· 744 755 * 745 756 * Keep it simple: clear DR6 immediately. 746 757 */ 747 - get_debugreg(*dr6, 6); 758 + get_debugreg(dr6, 6); 748 759 set_debugreg(0, 6); 749 760 /* Filter out all the reserved bits which are preset to 1 */ 750 - *dr6 &= ~DR6_RESERVED; 751 - } 761 + dr6 &= ~DR6_RESERVED; 752 762 753 - static __always_inline void debug_exit(unsigned long dr7) 754 - { 755 - local_db_restore(dr7); 763 + return dr6; 756 764 } 757 765 758 766 /* ··· 849 863 static __always_inline void exc_debug_kernel(struct pt_regs *regs, 850 864 unsigned long dr6) 851 865 { 866 + /* 867 + * Disable breakpoints during exception handling; recursive exceptions 868 + * are exceedingly 'fun'. 869 + * 870 + * Since this function is NOKPROBE, and that also applies to 871 + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a 872 + * HW_BREAKPOINT_W on our stack) 873 + * 874 + * Entry text is excluded for HW_BP_X and cpu_entry_area, which 875 + * includes the entry stack is excluded for everything. 
876 + */ 877 + unsigned long dr7 = local_db_save(); 852 878 bool irq_state = idtentry_enter_nmi(regs); 853 879 instrumentation_begin(); 854 880 ··· 881 883 882 884 instrumentation_end(); 883 885 idtentry_exit_nmi(regs, irq_state); 886 + 887 + local_db_restore(dr7); 884 888 } 885 889 886 890 static __always_inline void exc_debug_user(struct pt_regs *regs, ··· 893 893 * #DB, we will malfunction. 894 894 */ 895 895 WARN_ON_ONCE(!user_mode(regs)); 896 + 897 + /* 898 + * NB: We can't easily clear DR7 here because 899 + * idtentry_exit_to_usermode() can invoke ptrace, schedule, access 900 + * user memory, etc. This means that a recursive #DB is possible. If 901 + * this happens, that #DB will hit exc_debug_kernel() and clear DR7. 902 + * Since we're not on the IST stack right now, everything will be 903 + * fine. 904 + */ 896 905 897 906 irqentry_enter_from_user_mode(regs); 898 907 instrumentation_begin(); ··· 916 907 /* IST stack entry */ 917 908 DEFINE_IDTENTRY_DEBUG(exc_debug) 918 909 { 919 - unsigned long dr6, dr7; 920 - 921 - debug_enter(&dr6, &dr7); 922 - exc_debug_kernel(regs, dr6); 923 - debug_exit(dr7); 910 + exc_debug_kernel(regs, debug_read_clear_dr6()); 924 911 } 925 912 926 913 /* User entry, runs on regular task stack */ 927 914 DEFINE_IDTENTRY_DEBUG_USER(exc_debug) 928 915 { 929 - unsigned long dr6, dr7; 930 - 931 - debug_enter(&dr6, &dr7); 932 - exc_debug_user(regs, dr6); 933 - debug_exit(dr7); 916 + exc_debug_user(regs, debug_read_clear_dr6()); 934 917 } 935 918 #else 936 919 /* 32 bit does not have separate entry points. */ 937 920 DEFINE_IDTENTRY_RAW(exc_debug) 938 921 { 939 - unsigned long dr6, dr7; 940 - 941 - debug_enter(&dr6, &dr7); 922 + unsigned long dr6 = debug_read_clear_dr6(); 942 923 943 924 if (user_mode(regs)) 944 925 exc_debug_user(regs, dr6); 945 926 else 946 927 exc_debug_kernel(regs, dr6); 947 - 948 - debug_exit(dr7); 949 928 } 950 929 #endif 951 930

+1 -1

arch/x86/lib/Makefile

··· 24 24 CFLAGS_REMOVE_cmdline.o = -pg 25 25 endif 26 26 27 - CFLAGS_cmdline.o := -fno-stack-protector 27 + CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables 28 28 endif 29 29 30 30 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk

+78

arch/x86/mm/fault.c

··· 190 190 return pmd_k; 191 191 } 192 192 193 + /* 194 + * Handle a fault on the vmalloc or module mapping area 195 + * 196 + * This is needed because there is a race condition between the time 197 + * when the vmalloc mapping code updates the PMD to the point in time 198 + * where it synchronizes this update with the other page-tables in the 199 + * system. 200 + * 201 + * In this race window another thread/CPU can map an area on the same 202 + * PMD, finds it already present and does not synchronize it with the 203 + * rest of the system yet. As a result v[mz]alloc might return areas 204 + * which are not mapped in every page-table in the system, causing an 205 + * unhandled page-fault when they are accessed. 206 + */ 207 + static noinline int vmalloc_fault(unsigned long address) 208 + { 209 + unsigned long pgd_paddr; 210 + pmd_t *pmd_k; 211 + pte_t *pte_k; 212 + 213 + /* Make sure we are in vmalloc area: */ 214 + if (!(address >= VMALLOC_START && address < VMALLOC_END)) 215 + return -1; 216 + 217 + /* 218 + * Synchronize this task's top level page-table 219 + * with the 'reference' page table. 220 + * 221 + * Do _not_ use "current" here. We might be inside 222 + * an interrupt in the middle of a task switch.. 223 + */ 224 + pgd_paddr = read_cr3_pa(); 225 + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 226 + if (!pmd_k) 227 + return -1; 228 + 229 + if (pmd_large(*pmd_k)) 230 + return 0; 231 + 232 + pte_k = pte_offset_kernel(pmd_k, address); 233 + if (!pte_present(*pte_k)) 234 + return -1; 235 + 236 + return 0; 237 + } 238 + NOKPROBE_SYMBOL(vmalloc_fault); 239 + 193 240 void arch_sync_kernel_mappings(unsigned long start, unsigned long end) 194 241 { 195 242 unsigned long addr; ··· 1156 1109 * space, so do not expect them here. 1157 1110 */ 1158 1111 WARN_ON_ONCE(hw_error_code & X86_PF_PK); 1112 + 1113 + #ifdef CONFIG_X86_32 1114 + /* 1115 + * We can fault-in kernel-space virtual memory on-demand. The 1116 + * 'reference' page table is init_mm.pgd. 
1117 + * 1118 + * NOTE! We MUST NOT take any locks for this case. We may 1119 + * be in an interrupt or a critical region, and should 1120 + * only copy the information from the master page table, 1121 + * nothing more. 1122 + * 1123 + * Before doing this on-demand faulting, ensure that the 1124 + * fault is not any of the following: 1125 + * 1. A fault on a PTE with a reserved bit set. 1126 + * 2. A fault caused by a user-mode access. (Do not demand- 1127 + * fault kernel memory due to user-mode accesses). 1128 + * 3. A fault caused by a page-level protection violation. 1129 + * (A demand fault would be on a non-present page which 1130 + * would have X86_PF_PROT==0). 1131 + * 1132 + * This is only needed to close a race condition on x86-32 in 1133 + * the vmalloc mapping/unmapping code. See the comment above 1134 + * vmalloc_fault() for details. On x86-64 the race does not 1135 + * exist as the vmalloc mappings don't need to be synchronized 1136 + * there. 1137 + */ 1138 + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { 1139 + if (vmalloc_fault(address) >= 0) 1140 + return; 1141 + } 1142 + #endif 1159 1143 1160 1144 /* Was the fault spurious, caused by lazy TLB invalidation? */ 1161 1145 if (spurious_kernel_fault(hw_error_code, address))

+1 -1

arch/x86/mm/numa_emulation.c

··· 321 321 u64 addr, u64 max_addr, u64 size) 322 322 { 323 323 return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, 324 - 0, NULL, NUMA_NO_NODE); 324 + 0, NULL, 0); 325 325 } 326 326 327 327 static int __init setup_emu2phys_nid(int *dfl_phys_nid)

+42 -9

include/linux/entry-common.h

··· 110 110 #endif 111 111 112 112 /** 113 - * syscall_enter_from_user_mode - Check and handle work before invoking 114 - * a syscall 113 + * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts 114 + * @regs: Pointer to current's pt_regs 115 + * 116 + * Invoked from architecture specific syscall entry code with interrupts 117 + * disabled. The calling code has to be non-instrumentable. When the 118 + * function returns all state is correct, interrupts are enabled and the 119 + * subsequent functions can be instrumented. 120 + * 121 + * This handles lockdep, RCU (context tracking) and tracing state. 122 + * 123 + * This is invoked when there is extra architecture specific functionality 124 + * to be done between establishing state and handling user mode entry work. 125 + */ 126 + void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); 127 + 128 + /** 129 + * syscall_enter_from_user_mode_work - Check and handle work before invoking 130 + * a syscall 115 131 * @regs: Pointer to current's pt_regs 116 132 * @syscall: The syscall number 117 133 * 118 134 * Invoked from architecture specific syscall entry code with interrupts 119 - * disabled. The calling code has to be non-instrumentable. When the 120 - * function returns all state is correct and the subsequent functions can be 121 - * instrumented. 135 + * enabled after invoking syscall_enter_from_user_mode_prepare() and extra 136 + * architecture specific work. 122 137 * 123 138 * Returns: The original or a modified syscall number 124 139 * ··· 142 127 * syscall_set_return_value() first. If neither of those are called and -1 143 128 * is returned, then the syscall will fail with ENOSYS. 
144 129 * 145 - * The following functionality is handled here: 130 + * It handles the following work items: 146 131 * 147 - * 1) Establish state (lockdep, RCU (context tracking), tracing) 148 - * 2) TIF flag dependent invocations of arch_syscall_enter_tracehook(), 132 + * 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(), 149 133 * __secure_computing(), trace_sys_enter() 150 - * 3) Invocation of audit_syscall_entry() 134 + * 2) Invocation of audit_syscall_entry() 135 + */ 136 + long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); 137 + 138 + /** 139 + * syscall_enter_from_user_mode - Establish state and check and handle work 140 + * before invoking a syscall 141 + * @regs: Pointer to current's pt_regs 142 + * @syscall: The syscall number 143 + * 144 + * Invoked from architecture specific syscall entry code with interrupts 145 + * disabled. The calling code has to be non-instrumentable. When the 146 + * function returns all state is correct, interrupts are enabled and the 147 + * subsequent functions can be instrumented. 148 + * 149 + * This is a combination of syscall_enter_from_user_mode_prepare() and 150 + * syscall_enter_from_user_mode_work(). 151 + * 152 + * Returns: The original or a modified syscall number. See 153 + * syscall_enter_from_user_mode_work() for further explanation. 151 154 */ 152 155 long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); 153 156

+29 -6

kernel/entry/common.c

··· 69 69 return ret ? : syscall_get_nr(current, regs); 70 70 } 71 71 72 - noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) 72 + static __always_inline long 73 + __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) 73 74 { 74 75 unsigned long ti_work; 75 76 76 - enter_from_user_mode(regs); 77 - instrumentation_begin(); 78 - 79 - local_irq_enable(); 80 77 ti_work = READ_ONCE(current_thread_info()->flags); 81 78 if (ti_work & SYSCALL_ENTER_WORK) 82 79 syscall = syscall_trace_enter(regs, syscall, ti_work); 83 - instrumentation_end(); 84 80 85 81 return syscall; 82 + } 83 + 84 + long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) 85 + { 86 + return __syscall_enter_from_user_work(regs, syscall); 87 + } 88 + 89 + noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) 90 + { 91 + long ret; 92 + 93 + enter_from_user_mode(regs); 94 + 95 + instrumentation_begin(); 96 + local_irq_enable(); 97 + ret = __syscall_enter_from_user_work(regs, syscall); 98 + instrumentation_end(); 99 + 100 + return ret; 101 + } 102 + 103 + noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) 104 + { 105 + enter_from_user_mode(regs); 106 + instrumentation_begin(); 107 + local_irq_enable(); 108 + instrumentation_end(); 86 109 } 87 110 88 111 /**

Configure Feed

Configure Feed