Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86/pti updates from Thomas Gleixner:
"A mixed bag of fixes and updates for the ghosts which are hunting us.

The scheduler fixes have been pulled into that branch to avoid
conflicts.

- A set of fixes to address a kthread_parkme() race which caused lost
wakeups and loss of state.

- A deadlock fix for stop_machine() solved by moving the wakeups
outside of the stopper_lock held region.

- A set of Spectre V1 array access restrictions. The possible
problematic spots were discovered by Dan Carpenter's new checks in
smatch.

- Removal of an unused file which was forgotten when the rest of that
functionality was removed"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/vdso: Remove unused file
perf/x86/cstate: Fix possible Spectre-v1 indexing for pkg_msr
perf/x86/msr: Fix possible Spectre-v1 indexing in the MSR driver
perf/x86: Fix possible Spectre-v1 indexing for x86_pmu::event_map()
perf/x86: Fix possible Spectre-v1 indexing for hw_perf_event cache_*
perf/core: Fix possible Spectre-v1 indexing for ->aux_pages[]
sched/autogroup: Fix possible Spectre-v1 indexing for sched_prio_to_weight[]
sched/core: Fix possible Spectre-v1 indexing for sched_prio_to_weight[]
sched/core: Introduce set_special_state()
kthread, sched/wait: Fix kthread_parkme() completion issue
kthread, sched/wait: Fix kthread_parkme() wait-loop
sched/fair: Fix the update of blocked load when newly idle
stop_machine, sched: Fix migrate_swap() vs. active_balance() deadlock
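
All of the Spectre-v1 changes below follow one pattern: the existing bounds check on a user-controlled index is kept, and the index is additionally clamped with array_index_nospec() from <linux/nospec.h>, so a branch that is mispredicted as "in bounds" cannot speculatively load from an attacker-chosen offset. A minimal sketch of the pattern with a hypothetical table and bound, not code from this series:

    #include <linux/types.h>
    #include <linux/errno.h>
    #include <linux/nospec.h>

    #define MY_EVENT_MAX 8                          /* hypothetical bound */
    static u64 my_event_table[MY_EVENT_MAX];        /* hypothetical table */

    static int my_event_lookup(u64 cfg, u64 *val)
    {
            if (cfg >= MY_EVENT_MAX)
                    return -EINVAL;
            /* clamp cfg so a speculated-past bounds check cannot index out of bounds */
            cfg = array_index_nospec(cfg, MY_EVENT_MAX);
            *val = my_event_table[cfg];
            return 0;
    }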

14 files changed, 153 insertions(+), 78 deletions(-)
arch/x86/entry/vdso/vdso32/vdso-fakesections.c (-1)
···
- #include "../vdso-fakesections.c"
arch/x86/events/core.c (+7 -1)
···
  #include <linux/cpu.h>
  #include <linux/bitops.h>
  #include <linux/device.h>
+ #include <linux/nospec.h>

  #include <asm/apic.h>
  #include <asm/stacktrace.h>
···
          config = attr->config;

-         cache_type = (config >> 0) & 0xff;
+         cache_type = (config >>  0) & 0xff;
          if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
                  return -EINVAL;
+         cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);

          cache_op = (config >>  8) & 0xff;
          if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
                  return -EINVAL;
+         cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);

          cache_result = (config >> 16) & 0xff;
          if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
                  return -EINVAL;
+         cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);

          val = hw_cache_event_ids[cache_type][cache_op][cache_result];
···
          if (attr->config >= x86_pmu.max_events)
                  return -EINVAL;
+
+         attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);

          /*
           * The generic map:
arch/x86/events/intel/cstate.c (+2)
···
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/perf_event.h>
+ #include <linux/nospec.h>
  #include <asm/cpu_device_id.h>
  #include <asm/intel-family.h>
  #include "../perf_event.h"
···
          } else if (event->pmu == &cstate_pkg_pmu) {
                  if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
                          return -EINVAL;
+                 cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
                  if (!pkg_msr[cfg].attr)
                          return -EINVAL;
                  event->hw.event_base = pkg_msr[cfg].msr;
arch/x86/events/msr.c (+6 -3)
···
  // SPDX-License-Identifier: GPL-2.0
  #include <linux/perf_event.h>
+ #include <linux/nospec.h>
  #include <asm/intel-family.h>

  enum perf_msr_id {
···
          if (event->attr.type != event->pmu->type)
                  return -ENOENT;

-         if (cfg >= PERF_MSR_EVENT_MAX)
-                 return -EINVAL;
-
          /* unsupported modes and filters */
          if (event->attr.exclude_user ||
              event->attr.exclude_kernel ||
···
              event->attr.exclude_guest ||
              event->attr.sample_period) /* no sampling */
                  return -EINVAL;
+
+         if (cfg >= PERF_MSR_EVENT_MAX)
+                 return -EINVAL;
+
+         cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX);

          if (!msr[cfg].attr)
                  return -EINVAL;
include/linux/kthread.h (+1)
···
  int kthread_park(struct task_struct *k);
  void kthread_unpark(struct task_struct *k);
  void kthread_parkme(void);
+ void kthread_park_complete(struct task_struct *k);

  int kthreadd(void *unused);
  extern struct task_struct *kthreadd_task;
include/linux/sched.h (+45 -5)
···

  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP

+ /*
+  * Special states are those that do not use the normal wait-loop pattern. See
+  * the comment with set_special_state().
+  */
+ #define is_special_task_state(state) \
+         ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+
  #define __set_current_state(state_value) \
          do { \
+                 WARN_ON_ONCE(is_special_task_state(state_value)); \
                  current->task_state_change = _THIS_IP_; \
                  current->state = (state_value); \
          } while (0)
+
  #define set_current_state(state_value) \
          do { \
+                 WARN_ON_ONCE(is_special_task_state(state_value)); \
                  current->task_state_change = _THIS_IP_; \
                  smp_store_mb(current->state, (state_value)); \
          } while (0)

+ #define set_special_state(state_value) \
+         do { \
+                 unsigned long flags; /* may shadow */ \
+                 WARN_ON_ONCE(!is_special_task_state(state_value)); \
+                 raw_spin_lock_irqsave(&current->pi_lock, flags); \
+                 current->task_state_change = _THIS_IP_; \
+                 current->state = (state_value); \
+                 raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
+         } while (0)
  #else
  /*
   * set_current_state() includes a barrier so that the write of current->state
···
   *
   * The above is typically ordered against the wakeup, which does:
   *
- *   need_sleep = false;
- *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
+ *         need_sleep = false;
+ *         wake_up_state(p, TASK_UNINTERRUPTIBLE);
   *
   * Where wake_up_state() (and all other wakeup primitives) imply enough
   * barriers to order the store of the variable against wakeup.
···
   * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
   * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
   *
- * This is obviously fine, since they both store the exact same value.
+ * However, with slightly different timing the wakeup TASK_RUNNING store can
+ * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not
+ * a problem either because that will result in one extra go around the loop
+ * and our @cond test will save the day.
   *
   * Also see the comments of try_to_wake_up().
   */
- #define __set_current_state(state_value) do { current->state = (state_value); } while (0)
- #define set_current_state(state_value) smp_store_mb(current->state, (state_value))
+ #define __set_current_state(state_value) \
+         current->state = (state_value)
+
+ #define set_current_state(state_value) \
+         smp_store_mb(current->state, (state_value))
+
+ /*
+  * set_special_state() should be used for those states when the blocking task
+  * can not use the regular condition based wait-loop. In that case we must
+  * serialize against wakeups such that any possible in-flight TASK_RUNNING stores
+  * will not collide with our state change.
+  */
+ #define set_special_state(state_value) \
+         do { \
+                 unsigned long flags; /* may shadow */ \
+                 raw_spin_lock_irqsave(&current->pi_lock, flags); \
+                 current->state = (state_value); \
+                 raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
+         } while (0)
+
  #endif

  /* Task command name length: */
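For contrast with these special states, the regular condition-based sleep that set_current_state() is meant for looks roughly like the following; this is a generic sketch of the wait-loop pattern the comment above describes, where cond stands for whatever condition the waker sets:

    for (;;) {
            set_current_state(TASK_UNINTERRUPTIBLE);
            if (cond)       /* hypothetical condition set by the waker */
                    break;
            schedule();
    }
    __set_current_state(TASK_RUNNING);

A stray TASK_RUNNING store from a concurrent wakeup costs at most one extra trip around this loop, whereas TASK_STOPPED, TASK_TRACED and TASK_DEAD have no such retry loop, which is why set_special_state() serializes the state store under ->pi_lock.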
include/linux/sched/signal.h (+1 -1)
···
  {
          spin_lock_irq(&current->sighand->siglock);
          if (current->jobctl & JOBCTL_STOP_DEQUEUED)
-                 __set_current_state(TASK_STOPPED);
+                 set_special_state(TASK_STOPPED);
          spin_unlock_irq(&current->sighand->siglock);

          schedule();
kernel/events/ring_buffer.c (+5 -2)
···
  #include <linux/slab.h>
  #include <linux/circ_buf.h>
  #include <linux/poll.h>
+ #include <linux/nospec.h>

  #include "internal.h"

···
                          return NULL;

                  /* AUX space */
-                 if (pgoff >= rb->aux_pgoff)
-                         return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+                 if (pgoff >= rb->aux_pgoff) {
+                         int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
+                         return virt_to_page(rb->aux_pages[aux_pgoff]);
+                 }
          }

          return __perf_mmap_to_page(rb, pgoff);
kernel/kthread.c (+23 -27)
···
          KTHREAD_IS_PER_CPU = 0,
          KTHREAD_SHOULD_STOP,
          KTHREAD_SHOULD_PARK,
-         KTHREAD_IS_PARKED,
  };

  static inline void set_kthread_struct(void *kthread)
···

  static void __kthread_parkme(struct kthread *self)
  {
-         __set_current_state(TASK_PARKED);
-         while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
-                 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
-                         complete(&self->parked);
+         for (;;) {
+                 set_current_state(TASK_PARKED);
+                 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
+                         break;
                  schedule();
-                 __set_current_state(TASK_PARKED);
          }
-         clear_bit(KTHREAD_IS_PARKED, &self->flags);
          __set_current_state(TASK_RUNNING);
  }
···
          __kthread_parkme(to_kthread(current));
  }
  EXPORT_SYMBOL_GPL(kthread_parkme);
+
+ void kthread_park_complete(struct task_struct *k)
+ {
+         complete(&to_kthread(k)->parked);
+ }

  static int kthread(void *_create)
  {
···
  {
          struct kthread *kthread = to_kthread(k);

-         clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
          /*
-          * We clear the IS_PARKED bit here as we don't wait
-          * until the task has left the park code. So if we'd
-          * park before that happens we'd see the IS_PARKED bit
-          * which might be about to be cleared.
+          * Newly created kthread was parked when the CPU was offline.
+          * The binding was lost and we need to set it again.
           */
-         if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-                 /*
-                  * Newly created kthread was parked when the CPU was offline.
-                  * The binding was lost and we need to set it again.
-                  */
-                 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
-                         __kthread_bind(k, kthread->cpu, TASK_PARKED);
-                 wake_up_state(k, TASK_PARKED);
-         }
+         if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+                 __kthread_bind(k, kthread->cpu, TASK_PARKED);
+
+         clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+         wake_up_state(k, TASK_PARKED);
  }
  EXPORT_SYMBOL_GPL(kthread_unpark);
···
          if (WARN_ON(k->flags & PF_EXITING))
                  return -ENOSYS;

-         if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-                 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
-                 if (k != current) {
-                         wake_up_process(k);
-                         wait_for_completion(&kthread->parked);
-                 }
+         if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
+                 return -EBUSY;
+
+         set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+         if (k != current) {
+                 wake_up_process(k);
+                 wait_for_completion(&kthread->parked);
          }

          return 0;
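For reference, this is how the parking API is typically used from a kthread and its controller; the worker function, its work loop and worker_task are made up for illustration, only the kthread_* calls are the real API:

    #include <linux/kthread.h>
    #include <linux/delay.h>

    static int my_worker(void *data)
    {
            while (!kthread_should_stop()) {
                    if (kthread_should_park())
                            kthread_parkme();       /* sleeps in TASK_PARKED */
                    /* ... one unit of work ... */
                    msleep(100);
            }
            return 0;
    }

    /* controller side: freeze the worker across a critical section */
    kthread_park(worker_task);      /* returns once the worker is parked */
    /* ... */
    kthread_unpark(worker_task);

With this series, kthread_park() still waits on ->parked, but the completion now fires from finish_task_switch() via kthread_park_complete(), i.e. only after the parked thread has actually scheduled out, which addresses the lost wakeups and lost state mentioned in the pull message.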
kernel/sched/autogroup.c (+5 -2)
···
  /*
   * Auto-group scheduling implementation:
   */
+ #include <linux/nospec.h>
  #include "sched.h"

  unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
···
          static unsigned long next = INITIAL_JIFFIES;
          struct autogroup *ag;
          unsigned long shares;
-         int err;
+         int err, idx;

          if (nice < MIN_NICE || nice > MAX_NICE)
                  return -EINVAL;
···

          next = HZ / 10 + jiffies;
          ag = autogroup_task_get(p);
-         shares = scale_load(sched_prio_to_weight[nice + 20]);
+
+         idx = array_index_nospec(nice + 20, 40);
+         shares = scale_load(sched_prio_to_weight[idx]);

          down_write(&ag->lock);
          err = sched_group_set_shares(ag->tg, shares);
kernel/sched/core.c (+28 -28)
···
   */
  #include "sched.h"

+ #include <linux/kthread.h>
+ #include <linux/nospec.h>
+
  #include <asm/switch_to.h>
  #include <asm/tlb.h>

···
                  membarrier_mm_sync_core_before_usermode(mm);
                  mmdrop(mm);
          }
-         if (unlikely(prev_state == TASK_DEAD)) {
-                 if (prev->sched_class->task_dead)
-                         prev->sched_class->task_dead(prev);
+         if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
+                 switch (prev_state) {
+                 case TASK_DEAD:
+                         if (prev->sched_class->task_dead)
+                                 prev->sched_class->task_dead(prev);

-                 /*
-                  * Remove function-return probe instances associated with this
-                  * task and put them back on the free list.
-                  */
-                 kprobe_flush_task(prev);
+                         /*
+                          * Remove function-return probe instances associated with this
+                          * task and put them back on the free list.
+                          */
+                         kprobe_flush_task(prev);

-                 /* Task is done with its stack. */
-                 put_task_stack(prev);
+                         /* Task is done with its stack. */
+                         put_task_stack(prev);

-                 put_task_struct(prev);
+                         put_task_struct(prev);
+                         break;
+
+                 case TASK_PARKED:
+                         kthread_park_complete(prev);
+                         break;
+                 }
          }

          tick_nohz_task_switch();
···

  void __noreturn do_task_dead(void)
  {
-         /*
-          * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-          * when the following two conditions become true.
-          *   - There is race condition of mmap_sem (It is acquired by
-          *     exit_mm()), and
-          *   - SMI occurs before setting TASK_RUNINNG.
-          *     (or hypervisor of virtual machine switches to other guest)
-          *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-          *
-          * To avoid it, we have to wait for releasing tsk->pi_lock which
-          * is held by try_to_wake_up()
-          */
-         raw_spin_lock_irq(&current->pi_lock);
-         raw_spin_unlock_irq(&current->pi_lock);
-
          /* Causes final put_task_struct in finish_task_switch(): */
-         __set_current_state(TASK_DEAD);
+         set_special_state(TASK_DEAD);

          /* Tell freezer to ignore us: */
          current->flags |= PF_NOFREEZE;
···
                                  struct cftype *cft, s64 nice)
  {
          unsigned long weight;
+         int idx;

          if (nice < MIN_NICE || nice > MAX_NICE)
                  return -ERANGE;

-         weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+         idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+         idx = array_index_nospec(idx, 40);
+         weight = sched_prio_to_weight[idx];
+
          return sched_group_set_shares(css_tg(css), scale_load(weight));
  }
  #endif
kernel/sched/fair.c (+1 -1)
···
          if (curr_cost > this_rq->max_idle_balance_cost)
                  this_rq->max_idle_balance_cost = curr_cost;

+ out:
          /*
           * While browsing the domains, we released the rq lock, a task could
           * have been enqueued in the meantime. Since we're not going idle,
···
          if (this_rq->cfs.h_nr_running && !pulled_task)
                  pulled_task = 1;

- out:
          /* Move the next balance forward */
          if (time_after(this_rq->next_balance, next_balance))
                  this_rq->next_balance = next_balance;
kernel/signal.c (+15 -2)
···
                  return;
          }

+         set_special_state(TASK_TRACED);
+
          /*
           * We're committing to trapping. TRACED should be visible before
           * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
           * Also, transition to TRACED and updates to ->jobctl should be
           * atomic with respect to siglock and should be done after the arch
           * hook as siglock is released and regrabbed across it.
+          *
+          *     TRACER                               TRACEE
+          *
+          *     ptrace_attach()
+          * [L]   wait_on_bit(JOBCTL_TRAPPING)   [S] set_special_state(TRACED)
+          *     do_wait()
+          *       set_current_state()                smp_wmb();
+          *       ptrace_do_wait()
+          *         wait_task_stopped()
+          *           task_stopped_code()
+          * [L]       task_is_traced()           [S] task_clear_jobctl_trapping();
           */
-         set_current_state(TASK_TRACED);
+         smp_wmb();

          current->last_siginfo = info;
          current->exit_code = exit_code;
···
                  if (task_participate_group_stop(current))
                          notify = CLD_STOPPED;

-                 __set_current_state(TASK_STOPPED);
+                 set_special_state(TASK_STOPPED);
                  spin_unlock_irq(&current->sighand->siglock);

                  /*
kernel/stop_machine.c (+14 -5)
···
  #include <linux/smpboot.h>
  #include <linux/atomic.h>
  #include <linux/nmi.h>
+ #include <linux/sched/wake_q.h>

  /*
   * Structure to determine completion condition and record errors. May
···
  }

  static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
-                                   struct cpu_stop_work *work)
+                                   struct cpu_stop_work *work,
+                                   struct wake_q_head *wakeq)
  {
          list_add_tail(&work->list, &stopper->works);
-         wake_up_process(stopper->thread);
+         wake_q_add(wakeq, stopper->thread);
  }

  /* queue @work to @stopper. if offline, @work is completed immediately */
  static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
  {
          struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+         DEFINE_WAKE_Q(wakeq);
          unsigned long flags;
          bool enabled;

          spin_lock_irqsave(&stopper->lock, flags);
          enabled = stopper->enabled;
          if (enabled)
-                 __cpu_stop_queue_work(stopper, work);
+                 __cpu_stop_queue_work(stopper, work, &wakeq);
          else if (work->done)
                  cpu_stop_signal_done(work->done);
          spin_unlock_irqrestore(&stopper->lock, flags);
+
+         wake_up_q(&wakeq);

          return enabled;
  }
···
  {
          struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
          struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+         DEFINE_WAKE_Q(wakeq);
          int err;
  retry:
          spin_lock_irq(&stopper1->lock);
···
                  goto unlock;

          err = 0;
-         __cpu_stop_queue_work(stopper1, work1);
-         __cpu_stop_queue_work(stopper2, work2);
+         __cpu_stop_queue_work(stopper1, work1, &wakeq);
+         __cpu_stop_queue_work(stopper2, work2, &wakeq);
  unlock:
          spin_unlock(&stopper2->lock);
          spin_unlock_irq(&stopper1->lock);
···
                  cpu_relax();
                  goto retry;
          }
+
+         wake_up_q(&wakeq);
+
          return err;
  }
  /**
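
The deadlock fix relies on the generic wake_q mechanism: wakeups are only recorded with wake_q_add() while stopper->lock is held and are issued with wake_up_q() after the lock has been dropped. A minimal sketch of the same deferred-wakeup pattern in isolation, with a hypothetical lock, queue and worker task:

    #include <linux/list.h>
    #include <linux/sched.h>
    #include <linux/spinlock.h>
    #include <linux/sched/wake_q.h>

    static DEFINE_SPINLOCK(my_queue_lock);

    static void my_queue_item_and_wake(struct list_head *item, struct list_head *queue,
                                       struct task_struct *worker)
    {
            DEFINE_WAKE_Q(wakeq);

            spin_lock(&my_queue_lock);
            list_add_tail(item, queue);
            wake_q_add(&wakeq, worker);     /* record the wakeup, do not issue it yet */
            spin_unlock(&my_queue_lock);

            wake_up_q(&wakeq);              /* wake only after the lock is dropped */
    }

In the reported deadlock, the wakeup issued under the stopper locks could end up waiting on a task that was itself blocked on one of those locks; moving the wakeups outside the stopper_lock held region, as the pull message puts it, breaks that cycle.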