Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:
"A set of x86 and membarrier fixes:

- Correct a few problems in the x86 and the generic membarrier
implementation. Small corrections for assumptions about visibility
which have turned out not to be true.

- Make the PAT bits for memory encryption correct for 4K vs 2M/1G page
  table entries, as the PAT bit sits at a different location in
  large-page entries.

- Fix a concurrency issue in the local bandwidth readout of resource
  control which led to incorrect values.

- Fix the ordering of vector allocation for an interrupt. The code did
  not respect the provided cpumask: when the first attempt to allocate
  node-locally within the mask failed, it fell back to the node mask
  instead of trying the full provided mask first. This led to erroneous
  error messages and broke the (user) supplied affinity request.
  Reorder it.

- Make the INT3 padding detection in optprobe work correctly"

* tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/kprobes: Fix optprobe to detect INT3 padding correctly
x86/apic/vector: Fix ordering in vector assignment
x86/resctrl: Fix incorrect local bandwidth when mba_sc is enabled
x86/mm/mem_encrypt: Fix definition of PMD_FLAGS_DEC_WP
membarrier: Execute SYNC_CORE on the calling thread
membarrier: Explicitly sync remote cores when SYNC_CORE is requested
membarrier: Add an actual barrier before rseq_preempt()
x86/membarrier: Get rid of a dubious optimization

+111 -42
+1 -0
arch/x86/include/asm/pgtable_types.h
 #define _PAGE_ENC		(_AT(pteval_t, sme_me_mask))
 
 #define _PAGE_CACHE_MASK	(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
+#define _PAGE_LARGE_CACHE_MASK	(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
 
 #define _PAGE_NOCACHE		(cachemode2protval(_PAGE_CACHE_MODE_UC))
 #define _PAGE_CACHE_WP		(cachemode2protval(_PAGE_CACHE_MODE_WP))
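Why a separate mask is needed: in a 4K PTE the PAT bit is bit 7, but in a 2M/1G entry bit 7 is the PSE (page size) bit, so PAT moves to bit 12. A cache mask built on _PAGE_PAT therefore touches the wrong bit when applied to large-page entries. A rough sketch of the bit positions involved (these mirror the existing pgtable_types.h definitions; treat the sketch as illustrative):

/* Sketch of the relevant x86 paging bits (see pgtable_types.h) */
#define _PAGE_BIT_PWT        3   /* write-through */
#define _PAGE_BIT_PCD        4   /* cache disable */
#define _PAGE_BIT_PSE        7   /* 2M/1G page size bit */
#define _PAGE_BIT_PAT        7   /* PAT bit in a 4K PTE */
#define _PAGE_BIT_PAT_LARGE 12   /* PAT bit in a 2M/1G entry, since bit 7 is PSE there */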
+5 -4
arch/x86/include/asm/sync_core.h
 	/* With PTI, we unconditionally serialize before running user code. */
 	if (static_cpu_has(X86_FEATURE_PTI))
 		return;
+
 	/*
-	 * Return from interrupt and NMI is done through iret, which is core
-	 * serializing.
+	 * Even if we're in an interrupt, we might reschedule before returning,
+	 * in which case we could switch to a different thread in the same mm
+	 * and return using SYSRET or SYSEXIT. Instead of trying to keep
+	 * track of our need to sync the core, just sync right away.
 	 */
-	if (in_irq() || in_nmi())
-		return;
 	sync_core();
 }
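"Core serialization" here means forcing the CPU to discard anything it has already fetched or decoded, so that freshly written (cross-modified) code is what actually executes. IRET is core serializing; SYSRET/SYSEXIT are not, which is why the old early return became unsafe once the task could reschedule and leave the kernel through a different path. A toy illustration of a core-serializing operation (not the kernel's implementation, which prefers SERIALIZE or an IRET-to-self sequence):

/* Illustration only: CPUID is an architecturally serializing instruction,
 * so executing it discards any stale prefetched/decoded instructions. */
static inline void toy_sync_core(void)
{
	unsigned int eax = 0, ebx, ecx = 0, edx;

	asm volatile("cpuid"
		     : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx)
		     : : "memory");
}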
+14 -10
arch/x86/kernel/apic/vector.c
 	const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
 	int node = irq_data_get_node(irqd);
 
-	if (node == NUMA_NO_NODE)
-		goto all;
-	/* Try the intersection of @affmsk and node mask */
-	cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
-	if (!assign_vector_locked(irqd, vector_searchmask))
-		return 0;
-	/* Try the node mask */
-	if (!assign_vector_locked(irqd, cpumask_of_node(node)))
-		return 0;
-all:
+	if (node != NUMA_NO_NODE) {
+		/* Try the intersection of @affmsk and node mask */
+		cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+		if (!assign_vector_locked(irqd, vector_searchmask))
+			return 0;
+	}
+
 	/* Try the full affinity mask */
 	cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
 	if (!assign_vector_locked(irqd, vector_searchmask))
 		return 0;
+
+	if (node != NUMA_NO_NODE) {
+		/* Try the node mask */
+		if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+			return 0;
+	}
+
 	/* Try the full online mask */
 	return assign_vector_locked(irqd, cpu_online_mask);
 }
+2 -4
arch/x86/kernel/cpu/resctrl/monitor.c
 		return;
 
 	chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
-	m->chunks += chunks;
 	cur_bw = (chunks * r->mon_scale) >> 20;
 
 	if (m->delta_comp)
···
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+		__mon_event_count(rmid, &rr);
 
 		/*
 		 * Call the MBA software controller only for the
 		 * control groups and when user has enabled
 		 * the software controller explicitly.
 		 */
-		if (!is_mba_sc(NULL))
-			__mon_event_count(rmid, &rr);
-		else
+		if (is_mba_sc(NULL))
 			mbm_bw_count(rmid, &rr);
 	}
 }
+20 -2
arch/x86/kernel/kprobes/opt.c
 	return ret;
 }
 
+static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
+{
+	unsigned char ops;
+
+	for (; addr < eaddr; addr++) {
+		if (get_kernel_nofault(ops, (void *)addr) < 0 ||
+		    ops != INT3_INSN_OPCODE)
+			return false;
+	}
+
+	return true;
+}
+
 /* Decode whole function to ensure any instructions don't jump into target */
 static int can_optimize(unsigned long paddr)
 {
···
 			return 0;
 		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 		insn_get_length(&insn);
-		/* Another subsystem puts a breakpoint */
+		/*
+		 * In the case of detecting unknown breakpoint, this could be
+		 * a padding INT3 between functions. Let's check that all the
+		 * rest of the bytes are also INT3.
+		 */
 		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
-			return 0;
+			return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
+
 		/* Recover address */
 		insn.kaddr = (void *)addr;
 		insn.next_byte = (void *)(addr + insn.length);
+2 -2
arch/x86/mm/mem_encrypt_identity.c
 #define PMD_FLAGS_LARGE		(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
 
 #define PMD_FLAGS_DEC		PMD_FLAGS_LARGE
-#define PMD_FLAGS_DEC_WP	((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
-				 (_PAGE_PAT | _PAGE_PWT))
+#define PMD_FLAGS_DEC_WP	((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+				 (_PAGE_PAT_LARGE | _PAGE_PWT))
 
 #define PMD_FLAGS_ENC		(PMD_FLAGS_LARGE | _PAGE_ENC)
 
+8 -2
arch/x86/mm/tlb.c
 	/*
 	 * The membarrier system call requires a full memory barrier and
 	 * core serialization before returning to user-space, after
-	 * storing to rq->curr. Writing to CR3 provides that full
-	 * memory barrier and core serializing instruction.
+	 * storing to rq->curr, when changing mm. This is because
+	 * membarrier() sends IPIs to all CPUs that are in the target mm
+	 * to make them issue memory barriers. However, if another CPU
+	 * switches to/from the target mm concurrently with
+	 * membarrier(), it can cause that CPU not to receive an IPI
+	 * when it really should issue a memory barrier. Writing to CR3
+	 * provides that full memory barrier and core serializing
+	 * instruction.
 	 */
 	if (real_prev == next) {
 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+59 -18
kernel/sched/membarrier.c
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_core(void *info)
+{
+	/*
+	 * The smp_mb() in membarrier after all the IPIs is supposed to
+	 * ensure that memory on remote CPUs that occur before the IPI
+	 * become visible to membarrier()'s caller -- see scenario B in
+	 * the big comment at the top of this file.
+	 *
+	 * A sync_core() would provide this guarantee, but
+	 * sync_core_before_usermode() might end up being deferred until
+	 * after membarrier()'s smp_mb().
+	 */
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+
+	sync_core_before_usermode();
+}
+
 static void ipi_rseq(void *info)
 {
+	/*
+	 * Ensure that all stores done by the calling thread are visible
+	 * to the current task before the current task resumes. We could
+	 * probably optimize this away on most architectures, but by the
+	 * time we've already sent an IPI, the cost of the extra smp_mb()
+	 * is negligible.
+	 */
+	smp_mb();
 	rseq_preempt(current);
 }
 
···
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
+		ipi_func = ipi_sync_core;
 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
 		if (!IS_ENABLED(CONFIG_RSEQ))
 			return -EINVAL;
···
 			return -EPERM;
 	}
 
-	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
 		return 0;
 
 	/*
···
 
 		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
 			goto out;
-		if (cpu_id == raw_smp_processor_id())
-			goto out;
 		rcu_read_lock();
 		p = rcu_dereference(cpu_rq(cpu_id)->curr);
 		if (!p || p->mm != mm) {
···
 		for_each_online_cpu(cpu) {
 			struct task_struct *p;
 
-			/*
-			 * Skipping the current CPU is OK even through we can be
-			 * migrated at any point. The current CPU, at the point
-			 * where we read raw_smp_processor_id(), is ensured to
-			 * be in program order with respect to the caller
-			 * thread. Therefore, we can skip this CPU from the
-			 * iteration.
-			 */
-			if (cpu == raw_smp_processor_id())
-				continue;
 			p = rcu_dereference(cpu_rq(cpu)->curr);
 			if (p && p->mm == mm)
 				__cpumask_set_cpu(cpu, tmpmask);
···
 		rcu_read_unlock();
 	}
 
-	preempt_disable();
-	if (cpu_id >= 0)
+	if (cpu_id >= 0) {
+		/*
+		 * smp_call_function_single() will call ipi_func() if cpu_id
+		 * is the calling CPU.
+		 */
 		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
-	else
-		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
-	preempt_enable();
+	} else {
+		/*
+		 * For regular membarrier, we can save a few cycles by
+		 * skipping the current cpu -- we're about to do smp_mb()
+		 * below, and if we migrate to a different cpu, this cpu
+		 * and the new cpu will execute a full barrier in the
+		 * scheduler.
+		 *
+		 * For SYNC_CORE, we do need a barrier on the current cpu --
+		 * otherwise, if we are migrated and replaced by a different
+		 * task in the same mm just before, during, or after
+		 * membarrier, we will end up with some thread in the mm
+		 * running without a core sync.
+		 *
+		 * For RSEQ, don't rseq_preempt() the caller. User code
+		 * is not supposed to issue syscalls at all from inside an
+		 * rseq critical section.
+		 */
+		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+			preempt_disable();
+			smp_call_function_many(tmpmask, ipi_func, NULL, true);
+			preempt_enable();
+		} else {
+			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+		}
+	}
 
 out:
 	if (cpu_id < 0)
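For context, the typical userspace consumer of SYNC_CORE is a JIT (or anything else that rewrites code other threads may execute): it registers for the private expedited SYNC_CORE command once, then issues it after patching instructions. A minimal sketch using the raw syscall; glibc provides no wrapper, and the error handling here is purely illustrative:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static long membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* One-time registration, required before any SYNC_CORE request. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0)) {
		perror("membarrier register");
		return 1;
	}

	/* ... rewrite executable code that other threads may run ... */

	/*
	 * Every thread of this process -- after this fix, including the
	 * calling thread itself -- executes a core-serializing instruction
	 * before it next returns to user mode, so no thread keeps running
	 * stale instruction bytes.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0)) {
		perror("membarrier sync-core");
		return 1;
	}
	return 0;
}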