Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:
"A set of x86 and membarrier fixes:

- Correct a few problems in the x86 and the generic membarrier
implementation. Small corrections for assumptions about visibility
which have turned out not to be true.

- Make the PAT bits for memory encryption correct for 4K vs 2M/1G page
  table entries, as the PAT bit sits at a different location in
  large-page entries.

- Fix a concurrency issue in the local bandwidth readout of resource
  control which led to incorrect values.

- Fix the ordering of vector allocation for an interrupt. The code did
  not respect the provided cpumask: when the first attempt to allocate
  node-locally within the mask failed, it fell back to the node mask
  instead of trying the full provided mask first. This led to erroneous
  error messages and broke the (user) supplied affinity request.
  Reorder it.

- Make the INT3 padding detection in optprobe work correctly"

* tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/kprobes: Fix optprobe to detect INT3 padding correctly
x86/apic/vector: Fix ordering in vector assignment
x86/resctrl: Fix incorrect local bandwidth when mba_sc is enabled
x86/mm/mem_encrypt: Fix definition of PMD_FLAGS_DEC_WP
membarrier: Execute SYNC_CORE on the calling thread
membarrier: Explicitly sync remote cores when SYNC_CORE is requested
membarrier: Add an actual barrier before rseq_preempt()
x86/membarrier: Get rid of a dubious optimization

+111 -42
+1 -0
arch/x86/include/asm/pgtable_types.h
 #define _PAGE_ENC		(_AT(pteval_t, sme_me_mask))
 
 #define _PAGE_CACHE_MASK	(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
+#define _PAGE_LARGE_CACHE_MASK	(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
 
 #define _PAGE_NOCACHE		(cachemode2protval(_PAGE_CACHE_MODE_UC))
 #define _PAGE_CACHE_WP		(cachemode2protval(_PAGE_CACHE_MODE_WP))
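Why a separate mask is needed: in a 4K PTE the PAT bit is bit 7, but in a 2M/1G entry bit 7 is the PSE (page size) bit, so PAT moves to bit 12. A cache mask built on _PAGE_PAT therefore touches the wrong bit when applied to large-page entries. A rough sketch of the bit positions involved (these mirror the existing pgtable_types.h definitions; treat the sketch as illustrative):

/* Sketch of the relevant x86 paging bits (see pgtable_types.h) */
#define _PAGE_BIT_PWT        3   /* write-through */
#define _PAGE_BIT_PCD        4   /* cache disable */
#define _PAGE_BIT_PSE        7   /* 2M/1G page size bit */
#define _PAGE_BIT_PAT        7   /* PAT bit in a 4K PTE */
#define _PAGE_BIT_PAT_LARGE 12   /* PAT bit in a 2M/1G entry, since bit 7 is PSE there */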
+5 -4
arch/x86/include/asm/sync_core.h
 	/* With PTI, we unconditionally serialize before running user code. */
 	if (static_cpu_has(X86_FEATURE_PTI))
 		return;
+
 	/*
-	 * Return from interrupt and NMI is done through iret, which is core
-	 * serializing.
+	 * Even if we're in an interrupt, we might reschedule before returning,
+	 * in which case we could switch to a different thread in the same mm
+	 * and return using SYSRET or SYSEXIT. Instead of trying to keep
+	 * track of our need to sync the core, just sync right away.
 	 */
-	if (in_irq() || in_nmi())
-		return;
 	sync_core();
 }
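"Core serialization" here means forcing the CPU to discard anything it has already fetched or decoded, so that freshly written (cross-modified) code is what actually executes. IRET is core serializing; SYSRET/SYSEXIT are not, which is why the old early return became unsafe once the task could reschedule and leave the kernel through a different path. A toy illustration of a core-serializing operation (not the kernel's implementation, which prefers SERIALIZE or an IRET-to-self sequence):

/* Illustration only: CPUID is an architecturally serializing instruction,
 * so executing it discards any stale prefetched/decoded instructions. */
static inline void toy_sync_core(void)
{
	unsigned int eax = 0, ebx, ecx = 0, edx;

	asm volatile("cpuid"
		     : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx)
		     : : "memory");
}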
+14 -10
arch/x86/kernel/apic/vector.c
 	const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
 	int node = irq_data_get_node(irqd);
 
-	if (node == NUMA_NO_NODE)
-		goto all;
-	/* Try the intersection of @affmsk and node mask */
-	cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
-	if (!assign_vector_locked(irqd, vector_searchmask))
-		return 0;
-	/* Try the node mask */
-	if (!assign_vector_locked(irqd, cpumask_of_node(node)))
-		return 0;
-all:
+	if (node != NUMA_NO_NODE) {
+		/* Try the intersection of @affmsk and node mask */
+		cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+		if (!assign_vector_locked(irqd, vector_searchmask))
+			return 0;
+	}
+
 	/* Try the full affinity mask */
 	cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
 	if (!assign_vector_locked(irqd, vector_searchmask))
 		return 0;
+
+	if (node != NUMA_NO_NODE) {
+		/* Try the node mask */
+		if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+			return 0;
+	}
+
 	/* Try the full online mask */
 	return assign_vector_locked(irqd, cpu_online_mask);
 }
+2 -4
arch/x86/kernel/cpu/resctrl/monitor.c
 		return;
 
 	chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
-	m->chunks += chunks;
 	cur_bw = (chunks * r->mon_scale) >> 20;
 
 	if (m->delta_comp)
···
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+		__mon_event_count(rmid, &rr);
 
 		/*
 		 * Call the MBA software controller only for the
 		 * control groups and when user has enabled
 		 * the software controller explicitly.
 		 */
-		if (!is_mba_sc(NULL))
-			__mon_event_count(rmid, &rr);
-		else
+		if (is_mba_sc(NULL))
 			mbm_bw_count(rmid, &rr);
 	}
 }
+20 -2
arch/x86/kernel/kprobes/opt.c
 	return ret;
 }
 
+static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
+{
+	unsigned char ops;
+
+	for (; addr < eaddr; addr++) {
+		if (get_kernel_nofault(ops, (void *)addr) < 0 ||
+		    ops != INT3_INSN_OPCODE)
+			return false;
+	}
+
+	return true;
+}
+
 /* Decode whole function to ensure any instructions don't jump into target */
 static int can_optimize(unsigned long paddr)
 {
···
 			return 0;
 		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 		insn_get_length(&insn);
-		/* Another subsystem puts a breakpoint */
+		/*
+		 * In the case of detecting unknown breakpoint, this could be
+		 * a padding INT3 between functions. Let's check that all the
+		 * rest of the bytes are also INT3.
+		 */
 		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
-			return 0;
+			return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
+
 		/* Recover address */
 		insn.kaddr = (void *)addr;
 		insn.next_byte = (void *)(addr + insn.length);
+2 -2
arch/x86/mm/mem_encrypt_identity.c
 #define PMD_FLAGS_LARGE		(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
 
 #define PMD_FLAGS_DEC		PMD_FLAGS_LARGE
-#define PMD_FLAGS_DEC_WP	((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
-				 (_PAGE_PAT | _PAGE_PWT))
+#define PMD_FLAGS_DEC_WP	((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+				 (_PAGE_PAT_LARGE | _PAGE_PWT))
 
 #define PMD_FLAGS_ENC		(PMD_FLAGS_LARGE | _PAGE_ENC)
 
+8 -2
arch/x86/mm/tlb.c
 	/*
 	 * The membarrier system call requires a full memory barrier and
 	 * core serialization before returning to user-space, after
-	 * storing to rq->curr. Writing to CR3 provides that full
-	 * memory barrier and core serializing instruction.
+	 * storing to rq->curr, when changing mm. This is because
+	 * membarrier() sends IPIs to all CPUs that are in the target mm
+	 * to make them issue memory barriers. However, if another CPU
+	 * switches to/from the target mm concurrently with
+	 * membarrier(), it can cause that CPU not to receive an IPI
+	 * when it really should issue a memory barrier. Writing to CR3
+	 * provides that full memory barrier and core serializing
+	 * instruction.
 	 */
 	if (real_prev == next) {
 		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+59 -18
kernel/sched/membarrier.c
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_core(void *info)
+{
+	/*
+	 * The smp_mb() in membarrier after all the IPIs is supposed to
+	 * ensure that memory on remote CPUs that occur before the IPI
+	 * become visible to membarrier()'s caller -- see scenario B in
+	 * the big comment at the top of this file.
+	 *
+	 * A sync_core() would provide this guarantee, but
+	 * sync_core_before_usermode() might end up being deferred until
+	 * after membarrier()'s smp_mb().
+	 */
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+
+	sync_core_before_usermode();
+}
+
 static void ipi_rseq(void *info)
 {
+	/*
+	 * Ensure that all stores done by the calling thread are visible
+	 * to the current task before the current task resumes. We could
+	 * probably optimize this away on most architectures, but by the
+	 * time we've already sent an IPI, the cost of the extra smp_mb()
+	 * is negligible.
+	 */
+	smp_mb();
 	rseq_preempt(current);
 }
 
···
 		if (!(atomic_read(&mm->membarrier_state) &
 		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
 			return -EPERM;
+		ipi_func = ipi_sync_core;
 	} else if (flags == MEMBARRIER_FLAG_RSEQ) {
 		if (!IS_ENABLED(CONFIG_RSEQ))
 			return -EINVAL;
···
 			return -EPERM;
 	}
 
-	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+	if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+	    (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
 		return 0;
 
 	/*
···
 
 		if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
 			goto out;
-		if (cpu_id == raw_smp_processor_id())
-			goto out;
 		rcu_read_lock();
 		p = rcu_dereference(cpu_rq(cpu_id)->curr);
 		if (!p || p->mm != mm) {
···
 		for_each_online_cpu(cpu) {
 			struct task_struct *p;
 
-			/*
-			 * Skipping the current CPU is OK even through we can be
-			 * migrated at any point. The current CPU, at the point
-			 * where we read raw_smp_processor_id(), is ensured to
-			 * be in program order with respect to the caller
-			 * thread. Therefore, we can skip this CPU from the
-			 * iteration.
-			 */
-			if (cpu == raw_smp_processor_id())
-				continue;
 			p = rcu_dereference(cpu_rq(cpu)->curr);
 			if (p && p->mm == mm)
 				__cpumask_set_cpu(cpu, tmpmask);
···
 		rcu_read_unlock();
 	}
 
-	preempt_disable();
-	if (cpu_id >= 0)
+	if (cpu_id >= 0) {
+		/*
+		 * smp_call_function_single() will call ipi_func() if cpu_id
+		 * is the calling CPU.
+		 */
 		smp_call_function_single(cpu_id, ipi_func, NULL, 1);
-	else
-		smp_call_function_many(tmpmask, ipi_func, NULL, 1);
-	preempt_enable();
+	} else {
+		/*
+		 * For regular membarrier, we can save a few cycles by
+		 * skipping the current cpu -- we're about to do smp_mb()
+		 * below, and if we migrate to a different cpu, this cpu
+		 * and the new cpu will execute a full barrier in the
+		 * scheduler.
+		 *
+		 * For SYNC_CORE, we do need a barrier on the current cpu --
+		 * otherwise, if we are migrated and replaced by a different
+		 * task in the same mm just before, during, or after
+		 * membarrier, we will end up with some thread in the mm
+		 * running without a core sync.
+		 *
+		 * For RSEQ, don't rseq_preempt() the caller. User code
+		 * is not supposed to issue syscalls at all from inside an
+		 * rseq critical section.
+		 */
+		if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+			preempt_disable();
+			smp_call_function_many(tmpmask, ipi_func, NULL, true);
+			preempt_enable();
+		} else {
+			on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+		}
+	}
 
 out:
 	if (cpu_id < 0)
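For context, the typical userspace consumer of SYNC_CORE is a JIT (or anything else that rewrites code other threads may execute): it registers for the private expedited SYNC_CORE command once, then issues it after patching instructions. A minimal sketch using the raw syscall; glibc provides no wrapper, and the error handling here is purely illustrative:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static long membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* One-time registration, required before any SYNC_CORE request. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0)) {
		perror("membarrier register");
		return 1;
	}

	/* ... rewrite executable code that other threads may run ... */

	/*
	 * Every thread of this process -- after this fix, including the
	 * calling thread itself -- executes a core-serializing instruction
	 * before it next returns to user mode, so no thread keeps running
	 * stale instruction bytes.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0)) {
		perror("membarrier sync-core");
		return 1;
	}
	return 0;
}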