Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

- Correct the marking of kthreads which are supposed to run on a
specific, single CPU vs. those which are affine to only one CPU, mark
per-cpu workqueue threads as such and make sure that marking
"survives" CPU hotplug. Fix CPU hotplug issues with such kthreads.

- A fix to not push away tasks on CPUs coming online.

- Have workqueue CPU hotplug code use cpu_possible_mask when breaking
affinity on CPU offlining so that pending workers can finish on newly
onlined CPUs too.

- Dump tasks which haven't vacated a CPU which is currently being
unplugged.

- Register a special scale invariance callback which gets called on
resume from RAM to read out APERF/MPERF after resume and thus make
the schedutil scaling governor more precise.

* tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Relax the set_cpus_allowed_ptr() semantics
sched: Fix CPU hotplug / tighten is_per_cpu_kthread()
sched: Prepare to use balance_push in ttwu()
workqueue: Restrict affinity change to rescuer
workqueue: Tag bound workers with KTHREAD_IS_PER_CPU
kthread: Extract KTHREAD_IS_PER_CPU
sched: Don't run cpu-online with balance_push() enabled
workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity
sched/core: Print out straggler tasks in sched_cpu_dying()
x86: PM: Register syscore_ops for scale invariance

+151 -33
+19
arch/x86/kernel/smpboot.c
··· 56 56 #include <linux/numa.h> 57 57 #include <linux/pgtable.h> 58 58 #include <linux/overflow.h> 59 + #include <linux/syscore_ops.h> 59 60 60 61 #include <asm/acpi.h> 61 62 #include <asm/desc.h> ··· 2084 2083 this_cpu_write(arch_prev_mperf, mperf); 2085 2084 } 2086 2085 2086 + #ifdef CONFIG_PM_SLEEP 2087 + static struct syscore_ops freq_invariance_syscore_ops = { 2088 + .resume = init_counter_refs, 2089 + }; 2090 + 2091 + static void register_freq_invariance_syscore_ops(void) 2092 + { 2093 + /* Bail out if registered already. */ 2094 + if (freq_invariance_syscore_ops.node.prev) 2095 + return; 2096 + 2097 + register_syscore_ops(&freq_invariance_syscore_ops); 2098 + } 2099 + #else 2100 + static inline void register_freq_invariance_syscore_ops(void) {} 2101 + #endif 2102 + 2087 2103 static void init_freq_invariance(bool secondary, bool cppc_ready) 2088 2104 { 2089 2105 bool ret = false; ··· 2127 2109 if (ret) { 2128 2110 init_counter_refs(); 2129 2111 static_branch_enable(&arch_scale_freq_key); 2112 + register_freq_invariance_syscore_ops(); 2130 2113 pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); 2131 2114 } else { 2132 2115 pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+3
include/linux/kthread.h
··· 33 33 unsigned int cpu, 34 34 const char *namefmt); 35 35 36 + void kthread_set_per_cpu(struct task_struct *k, int cpu); 37 + bool kthread_is_per_cpu(struct task_struct *k); 38 + 36 39 /** 37 40 * kthread_run - create and wake a thread. 38 41 * @threadfn: the function to run until signal_pending(current).
+26 -1
kernel/kthread.c
··· 493 493 return p; 494 494 kthread_bind(p, cpu); 495 495 /* CPU hotplug need to bind once again when unparking the thread. */ 496 - set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); 497 496 to_kthread(p)->cpu = cpu; 498 497 return p; 498 + } 499 + 500 + void kthread_set_per_cpu(struct task_struct *k, int cpu) 501 + { 502 + struct kthread *kthread = to_kthread(k); 503 + if (!kthread) 504 + return; 505 + 506 + WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); 507 + 508 + if (cpu < 0) { 509 + clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); 510 + return; 511 + } 512 + 513 + kthread->cpu = cpu; 514 + set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); 515 + } 516 + 517 + bool kthread_is_per_cpu(struct task_struct *k) 518 + { 519 + struct kthread *kthread = to_kthread(k); 520 + if (!kthread) 521 + return false; 522 + 523 + return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); 499 524 } 500 525 501 526 /**
+88 -23
kernel/sched/core.c
··· 1796 1796 */ 1797 1797 static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 1798 1798 { 1799 + /* When not in the task's cpumask, no point in looking further. */ 1799 1800 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 1800 1801 return false; 1801 1802 1802 - if (is_per_cpu_kthread(p) || is_migration_disabled(p)) 1803 + /* migrate_disabled() must be allowed to finish. */ 1804 + if (is_migration_disabled(p)) 1803 1805 return cpu_online(cpu); 1804 1806 1805 - return cpu_active(cpu); 1807 + /* Non kernel threads are not allowed during either online or offline. */ 1808 + if (!(p->flags & PF_KTHREAD)) 1809 + return cpu_active(cpu); 1810 + 1811 + /* KTHREAD_IS_PER_CPU is always allowed. */ 1812 + if (kthread_is_per_cpu(p)) 1813 + return cpu_online(cpu); 1814 + 1815 + /* Regular kernel threads don't get to stay during offline. */ 1816 + if (cpu_rq(cpu)->balance_push) 1817 + return false; 1818 + 1819 + /* But are allowed during online. */ 1820 + return cpu_online(cpu); 1806 1821 } 1807 1822 1808 1823 /* ··· 2342 2327 2343 2328 if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { 2344 2329 /* 2345 - * Kernel threads are allowed on online && !active CPUs. 2330 + * Kernel threads are allowed on online && !active CPUs, 2331 + * however, during cpu-hot-unplug, even these might get pushed 2332 + * away if not KTHREAD_IS_PER_CPU. 2346 2333 * 2347 2334 * Specifically, migration_disabled() tasks must not fail the 2348 2335 * cpumask_any_and_distribute() pick below, esp. so on ··· 2387 2370 } 2388 2371 2389 2372 __do_set_cpus_allowed(p, new_mask, flags); 2390 - 2391 - if (p->flags & PF_KTHREAD) { 2392 - /* 2393 - * For kernel threads that do indeed end up on online && 2394 - * !active we want to ensure they are strict per-CPU threads. 
2395 - */ 2396 - WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && 2397 - !cpumask_intersects(new_mask, cpu_active_mask) && 2398 - p->nr_cpus_allowed != 1); 2399 - } 2400 2373 2401 2374 return affine_move_task(rq, p, &rf, dest_cpu, flags); 2402 2375 ··· 3128 3121 3129 3122 static inline bool ttwu_queue_cond(int cpu, int wake_flags) 3130 3123 { 3124 + /* 3125 + * Do not complicate things with the async wake_list while the CPU is 3126 + * in hotplug state. 3127 + */ 3128 + if (!cpu_active(cpu)) 3129 + return false; 3130 + 3131 3131 /* 3132 3132 * If the CPU does not share cache, then queue the task on the 3133 3133 * remote rqs wakelist to avoid accessing remote data. ··· 7290 7276 /* 7291 7277 * Both the cpu-hotplug and stop task are in this case and are 7292 7278 * required to complete the hotplug process. 7279 + * 7280 + * XXX: the idle task does not match kthread_is_per_cpu() due to 7281 + * histerical raisins. 7293 7282 */ 7294 - if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { 7283 + if (rq->idle == push_task || 7284 + ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) || 7285 + is_migration_disabled(push_task)) { 7286 + 7295 7287 /* 7296 7288 * If this is the idle task on the outgoing CPU try to wake 7297 7289 * up the hotplug control thread which might wait for the ··· 7329 7309 /* 7330 7310 * At this point need_resched() is true and we'll take the loop in 7331 7311 * schedule(). The next pick is obviously going to be the stop task 7332 - * which is_per_cpu_kthread() and will push this task away. 7312 + * which kthread_is_per_cpu() and will push this task away. 
7333 7313 */ 7334 7314 raw_spin_lock(&rq->lock); 7335 7315 } ··· 7340 7320 struct rq_flags rf; 7341 7321 7342 7322 rq_lock_irqsave(rq, &rf); 7343 - if (on) 7323 + rq->balance_push = on; 7324 + if (on) { 7325 + WARN_ON_ONCE(rq->balance_callback); 7344 7326 rq->balance_callback = &balance_push_callback; 7345 - else 7327 + } else if (rq->balance_callback == &balance_push_callback) { 7346 7328 rq->balance_callback = NULL; 7329 + } 7347 7330 rq_unlock_irqrestore(rq, &rf); 7348 7331 } 7349 7332 ··· 7464 7441 struct rq *rq = cpu_rq(cpu); 7465 7442 struct rq_flags rf; 7466 7443 7444 + /* 7445 + * Make sure that when the hotplug state machine does a roll-back 7446 + * we clear balance_push. Ideally that would happen earlier... 7447 + */ 7467 7448 balance_push_set(cpu, false); 7468 7449 7469 7450 #ifdef CONFIG_SCHED_SMT ··· 7510 7483 int ret; 7511 7484 7512 7485 set_cpu_active(cpu, false); 7486 + 7513 7487 /* 7514 - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU 7515 - * users of this state to go away such that all new such users will 7516 - * observe it. 7488 + * From this point forward, this CPU will refuse to run any task that 7489 + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively 7490 + * push those tasks away until this gets cleared, see 7491 + * sched_cpu_dying(). 7492 + */ 7493 + balance_push_set(cpu, true); 7494 + 7495 + /* 7496 + * We've cleared cpu_active_mask / set balance_push, wait for all 7497 + * preempt-disabled and RCU users of this state to go away such that 7498 + * all new such users will observe it. 7499 + * 7500 + * Specifically, we rely on ttwu to no longer target this CPU, see 7501 + * ttwu_queue_cond() and is_cpu_allowed(). 7517 7502 * 7518 7503 * Do sync before park smpboot threads to take care the rcu boost case. 
7519 7504 */ 7520 7505 synchronize_rcu(); 7521 - 7522 - balance_push_set(cpu, true); 7523 7506 7524 7507 rq_lock_irqsave(rq, &rf); 7525 7508 if (rq->rd) { ··· 7611 7574 atomic_long_add(delta, &calc_load_tasks); 7612 7575 } 7613 7576 7577 + static void dump_rq_tasks(struct rq *rq, const char *loglvl) 7578 + { 7579 + struct task_struct *g, *p; 7580 + int cpu = cpu_of(rq); 7581 + 7582 + lockdep_assert_held(&rq->lock); 7583 + 7584 + printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); 7585 + for_each_process_thread(g, p) { 7586 + if (task_cpu(p) != cpu) 7587 + continue; 7588 + 7589 + if (!task_on_rq_queued(p)) 7590 + continue; 7591 + 7592 + printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); 7593 + } 7594 + } 7595 + 7614 7596 int sched_cpu_dying(unsigned int cpu) 7615 7597 { 7616 7598 struct rq *rq = cpu_rq(cpu); ··· 7639 7583 sched_tick_stop(cpu); 7640 7584 7641 7585 rq_lock_irqsave(rq, &rf); 7642 - BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); 7586 + if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { 7587 + WARN(true, "Dying CPU not properly vacated!"); 7588 + dump_rq_tasks(rq, KERN_WARNING); 7589 + } 7643 7590 rq_unlock_irqrestore(rq, &rf); 7591 + 7592 + /* 7593 + * Now that the CPU is offline, make sure we're welcome 7594 + * to new tasks once we come back up. 7595 + */ 7596 + balance_push_set(cpu, false); 7644 7597 7645 7598 calc_load_migrate(rq); 7646 7599 update_max_interval();
+1
kernel/sched/sched.h
··· 975 975 unsigned long cpu_capacity_orig; 976 976 977 977 struct callback_head *balance_callback; 978 + unsigned char balance_push; 978 979 979 980 unsigned char nohz_idle_balance; 980 981 unsigned char idle_balance;
+1
kernel/smpboot.c
··· 188 188 kfree(td); 189 189 return PTR_ERR(tsk); 190 190 } 191 + kthread_set_per_cpu(tsk, cpu); 191 192 /* 192 193 * Park the thread so that it could start right on the CPU 193 194 * when it is available.
+13 -9
kernel/workqueue.c
··· 1849 1849 mutex_lock(&wq_pool_attach_mutex); 1850 1850 1851 1851 /* 1852 - * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1853 - * online CPUs. It'll be re-applied when any of the CPUs come up. 1854 - */ 1855 - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 1856 - 1857 - /* 1858 1852 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains 1859 1853 * stable across this function. See the comments above the flag 1860 1854 * definition for details. 1861 1855 */ 1862 1856 if (pool->flags & POOL_DISASSOCIATED) 1863 1857 worker->flags |= WORKER_UNBOUND; 1858 + else 1859 + kthread_set_per_cpu(worker->task, pool->cpu); 1860 + 1861 + if (worker->rescue_wq) 1862 + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); 1864 1863 1865 1864 list_add_tail(&worker->node, &pool->workers); 1866 1865 worker->pool = pool; ··· 1882 1883 1883 1884 mutex_lock(&wq_pool_attach_mutex); 1884 1885 1886 + kthread_set_per_cpu(worker->task, -1); 1885 1887 list_del(&worker->node); 1886 1888 worker->pool = NULL; 1887 1889 ··· 4919 4919 4920 4920 raw_spin_unlock_irq(&pool->lock); 4921 4921 4922 - for_each_pool_worker(worker, pool) 4923 - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); 4922 + for_each_pool_worker(worker, pool) { 4923 + kthread_set_per_cpu(worker->task, -1); 4924 + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); 4925 + } 4924 4926 4925 4927 mutex_unlock(&wq_pool_attach_mutex); 4926 4928 ··· 4974 4972 * of all workers first and then clear UNBOUND. As we're called 4975 4973 * from CPU_ONLINE, the following shouldn't fail. 4976 4974 */ 4977 - for_each_pool_worker(worker, pool) 4975 + for_each_pool_worker(worker, pool) { 4976 + kthread_set_per_cpu(worker->task, pool->cpu); 4978 4977 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4979 4978 pool->attrs->cpumask) < 0); 4979 + } 4980 4980 4981 4981 raw_spin_lock_irq(&pool->lock); 4982 4982