Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'sched_urgent_for_5.8_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:
"The most anticipated fix in this pull request is probably the horrible
build fix for the RANDSTRUCT fail that didn't make -rc2. Also included
is the cleanup that removes those BUILD_BUG_ON()s and replaces them with
ugly unions.

Also included is the try_to_wake_up() race fix that was first
triggered by Paul's RCU-torture runs, but was independently hit by
Dave Chinner's fstest runs as well"

* tag 'sched_urgent_for_5.8_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/cfs: change initial value of runnable_avg
smp, irq_work: Continue smp_call_function*() and irq_work*() integration
sched/core: s/WF_ON_RQ/WQ_ON_CPU/
sched/core: Fix ttwu() race
sched/core: Fix PI boosting between RT and DEADLINE tasks
sched/deadline: Initialize ->dl_boosted
sched/core: Check cpus_mask, not cpus_ptr in __set_cpus_allowed_ptr(), to fix mask corruption
sched/core: Fix CONFIG_GCC_PLUGIN_RANDSTRUCT build fail
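
The "ugly unions" the message refers to replace offset assertions with a shared header type: instead of BUILD_BUG_ON() checks that struct irq_work and struct __call_single_data happen to line up, both structs (see the diffs below) now embed a common struct __call_single_node through an anonymous union. A minimal standalone sketch of the pattern, using hypothetical names (queue_node, entry_a) rather than the kernel's:

#include <stdatomic.h>
#include <stddef.h>

/* Common on-queue header, analogous to struct __call_single_node. */
struct queue_node {
        struct queue_node *next;
        union {
                unsigned int u_flags;   /* read as a plain word ... */
                atomic_uint a_flags;    /* ... or as an atomic, per entry type */
        };
};

/* An entry type keeps its legacy field names alive via the union. */
struct entry_a {
        union {
                struct queue_node node;
                struct {
                        struct queue_node *llnode;
                        atomic_uint flags;
                };
        };
        void (*func)(struct entry_a *);
};

/* The compiler now guarantees the shared prefix by construction; the
 * offsetof() assertions deleted from kernel/smp.c below become moot. */
_Static_assert(offsetof(struct entry_a, node) == 0, "header must come first");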

+120 -65
+8 -18
include/linux/irq_work.h
···
 #ifndef _LINUX_IRQ_WORK_H
 #define _LINUX_IRQ_WORK_H
 
-#include <linux/llist.h>
+#include <linux/smp_types.h>
 
 /*
  * An entry can be in one of four states:
···
  * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
  */
 
-/* flags share CSD_FLAG_ space */
-
-#define IRQ_WORK_PENDING        BIT(0)
-#define IRQ_WORK_BUSY           BIT(1)
-
-/* Doesn't want IPI, wait for tick: */
-#define IRQ_WORK_LAZY           BIT(2)
-/* Run hard IRQ context, even on RT */
-#define IRQ_WORK_HARD_IRQ       BIT(3)
-
-#define IRQ_WORK_CLAIMED        (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
-
-/*
- * structure shares layout with single_call_data_t.
- */
 struct irq_work {
-        struct llist_node llnode;
-        atomic_t flags;
+        union {
+                struct __call_single_node node;
+                struct {
+                        struct llist_node llnode;
+                        atomic_t flags;
+                };
+        };
         void (*func)(struct irq_work *);
 };
+1 -2
include/linux/sched.h
···
         unsigned int                    ptrace;
 
 #ifdef CONFIG_SMP
-        struct llist_node               wake_entry;
-        unsigned int                    wake_entry_type;
         int                             on_cpu;
+        struct __call_single_node       wake_entry;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
         /* Current CPU: */
         unsigned int                    cpu;
+8 -15
include/linux/smp.h
···
 #include <linux/list.h>
 #include <linux/cpumask.h>
 #include <linux/init.h>
-#include <linux/llist.h>
+#include <linux/smp_types.h>
 
 typedef void (*smp_call_func_t)(void *info);
 typedef bool (*smp_cond_func_t)(int cpu, void *info);
-
-enum {
-        CSD_FLAG_LOCK           = 0x01,
-
-        /* IRQ_WORK_flags */
-
-        CSD_TYPE_ASYNC          = 0x00,
-        CSD_TYPE_SYNC           = 0x10,
-        CSD_TYPE_IRQ_WORK       = 0x20,
-        CSD_TYPE_TTWU           = 0x30,
-        CSD_FLAG_TYPE_MASK      = 0xF0,
-};
 
 /*
  * structure shares (partial) layout with struct irq_work
  */
 struct __call_single_data {
-        struct llist_node       llist;
-        unsigned int            flags;
+        union {
+                struct __call_single_node       node;
+                struct {
+                        struct llist_node       llist;
+                        unsigned int            flags;
+                };
+        };
         smp_call_func_t         func;
         void                    *info;
 };
+66
include/linux/smp_types.h
···
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_SMP_TYPES_H
+#define __LINUX_SMP_TYPES_H
+
+#include <linux/llist.h>
+
+enum {
+        CSD_FLAG_LOCK           = 0x01,
+
+        IRQ_WORK_PENDING        = 0x01,
+        IRQ_WORK_BUSY           = 0x02,
+        IRQ_WORK_LAZY           = 0x04, /* No IPI, wait for tick */
+        IRQ_WORK_HARD_IRQ       = 0x08, /* IRQ context on PREEMPT_RT */
+
+        IRQ_WORK_CLAIMED        = (IRQ_WORK_PENDING | IRQ_WORK_BUSY),
+
+        CSD_TYPE_ASYNC          = 0x00,
+        CSD_TYPE_SYNC           = 0x10,
+        CSD_TYPE_IRQ_WORK       = 0x20,
+        CSD_TYPE_TTWU           = 0x30,
+
+        CSD_FLAG_TYPE_MASK      = 0xF0,
+};
+
+/*
+ * struct __call_single_node is the primary type on
+ * smp.c:call_single_queue.
+ *
+ * flush_smp_call_function_queue() only reads the type from
+ * __call_single_node::u_flags as a regular load, the above
+ * (anonymous) enum defines all the bits of this word.
+ *
+ * Other bits are not modified until the type is known.
+ *
+ * CSD_TYPE_SYNC/ASYNC:
+ *        struct {
+ *                struct llist_node       node;
+ *                unsigned int            flags;
+ *                smp_call_func_t         func;
+ *                void                    *info;
+ *        };
+ *
+ * CSD_TYPE_IRQ_WORK:
+ *        struct {
+ *                struct llist_node       node;
+ *                atomic_t                flags;
+ *                void                    (*func)(struct irq_work *);
+ *        };
+ *
+ * CSD_TYPE_TTWU:
+ *        struct {
+ *                struct llist_node       node;
+ *                unsigned int            flags;
+ *        };
+ */
+
+struct __call_single_node {
+        struct llist_node       llist;
+        union {
+                unsigned int    u_flags;
+                atomic_t        a_flags;
+        };
+};
+
+#endif /* __LINUX_SMP_TYPES_H */
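
The comment block in this new header describes the consumer-side contract: flush_smp_call_function_queue() reads the type bits from u_flags first and only then interprets the rest of the entry. A rough userspace analogue of that demux, simplified and with hypothetical names (drain(), struct node), not the kernel's actual queue walk:

#include <stdio.h>

enum {
        TYPE_ASYNC      = 0x00,
        TYPE_SYNC       = 0x10,
        TYPE_IRQ_WORK   = 0x20,
        TYPE_TTWU       = 0x30,
        TYPE_MASK       = 0xF0,
};

struct node {                   /* shared header, as in __call_single_node */
        struct node *next;
        unsigned int u_flags;
};

static void drain(struct node *head)
{
        for (struct node *n = head; n; n = n->next) {
                /* The type is read as a regular load before any other
                 * field of the entry is interpreted. */
                switch (n->u_flags & TYPE_MASK) {
                case TYPE_SYNC:
                case TYPE_ASYNC:
                        printf("smp_call_function entry\n");
                        break;
                case TYPE_IRQ_WORK:
                        printf("irq_work entry\n");
                        break;
                case TYPE_TTWU:
                        printf("remote wakeup entry\n");
                        break;
                }
        }
}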
+34 -10
kernel/sched/core.c
···
                 goto out;
         }
 
-        if (cpumask_equal(p->cpus_ptr, new_mask))
+        if (cpumask_equal(&p->cpus_mask, new_mask))
                 goto out;
 
         /*
···
         rq_lock_irqsave(rq, &rf);
         update_rq_clock(rq);
 
-        llist_for_each_entry_safe(p, t, llist, wake_entry)
+        llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
+                if (WARN_ON_ONCE(p->on_cpu))
+                        smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+                if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+                        set_task_cpu(p, cpu_of(rq));
+
                 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+        }
 
         rq_unlock_irqrestore(rq, &rf);
 }
···
         p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
 
         WRITE_ONCE(rq->ttwu_pending, 1);
-        __smp_call_single_queue(cpu, &p->wake_entry);
+        __smp_call_single_queue(cpu, &p->wake_entry.llist);
 }
 
 void wake_up_if_idle(int cpu)
···
          * the soon-to-be-idle CPU as the current CPU is likely busy.
          * nr_running is checked to avoid unnecessary task stacking.
          */
-        if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+        if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
                 return true;
 
         return false;
···
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
         if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+                if (WARN_ON_ONCE(cpu == smp_processor_id()))
+                        return false;
+
                 sched_clock_cpu(cpu); /* Sync clocks across CPUs */
                 __ttwu_queue_wakelist(p, cpu, wake_flags);
                 return true;
···
                 goto out;
 
         success = 1;
-        cpu = task_cpu(p);
         trace_sched_waking(p);
         p->state = TASK_RUNNING;
         trace_sched_wakeup(p);
···
 
         /* We're going to change ->state: */
         success = 1;
-        cpu = task_cpu(p);
 
         /*
          * Ensure we load p->on_rq _after_ p->state, otherwise it would
···
          * which potentially sends an IPI instead of spinning on p->on_cpu to
          * let the waker make forward progress. This is safe because IRQs are
          * disabled and the IPI will deliver after on_cpu is cleared.
+         *
+         * Ensure we load task_cpu(p) after p->on_cpu:
+         *
+         * set_task_cpu(p, cpu);
+         *   STORE p->cpu = @cpu
+         * __schedule() (switch to task 'p')
+         *   LOCK rq->lock
+         *   smp_mb__after_spin_lock()          smp_cond_load_acquire(&p->on_cpu)
+         *   STORE p->on_cpu = 1                LOAD p->cpu
+         *
+         * to ensure we observe the correct CPU on which the task is currently
+         * scheduling.
          */
-        if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+        if (smp_load_acquire(&p->on_cpu) &&
+            ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
                 goto unlock;
 
         /*
···
                 psi_ttwu_dequeue(p);
                 set_task_cpu(p, cpu);
         }
+#else
+        cpu = task_cpu(p);
 #endif /* CONFIG_SMP */
 
         ttwu_queue(p, cpu, wake_flags);
···
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 out:
         if (success)
-                ttwu_stat(p, cpu, wake_flags);
+                ttwu_stat(p, task_cpu(p), wake_flags);
         preempt_enable();
 
         return success;
···
 #endif
         init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
-        p->wake_entry_type = CSD_TYPE_TTWU;
+        p->wake_entry.u_flags = CSD_TYPE_TTWU;
 #endif
 }
···
          */
         if (dl_prio(prio)) {
                 if (!dl_prio(p->normal_prio) ||
-                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+                    (pi_task && dl_prio(pi_task->prio) &&
+                     dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
                         queue_flag |= ENQUEUE_REPLENISH;
                 } else
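
The heart of the ttwu() race fix above is an ordering requirement: the waker may only trust task_cpu(p) once it has observed p->on_cpu, which is why READ_ONCE() becomes smp_load_acquire() and the CPU is re-read after it. A minimal C11 analogue of that acquire/release pairing, with hypothetical names (publish(), waker_sees_cpu()); the kernel publishes on_cpu under the rq lock with smp_mb__after_spin_lock() rather than a plain release store:

#include <stdatomic.h>

struct task {
        _Atomic int on_cpu;
        _Atomic int cpu;
};

/* Scheduler side: write the CPU, then publish on_cpu with release. */
static void publish(struct task *p, int cpu)
{
        atomic_store_explicit(&p->cpu, cpu, memory_order_relaxed);
        atomic_store_explicit(&p->on_cpu, 1, memory_order_release);
}

/* Waker side: the acquire load of on_cpu orders the later cpu read,
 * so a waker that sees on_cpu == 1 also sees the matching cpu value. */
static int waker_sees_cpu(struct task *p)
{
        if (atomic_load_explicit(&p->on_cpu, memory_order_acquire))
                return atomic_load_explicit(&p->cpu, memory_order_relaxed);
        return -1;      /* task is not currently running anywhere */
}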
+1
kernel/sched/deadline.c
···
         dl_se->dl_bw = 0;
         dl_se->dl_density = 0;
 
+        dl_se->dl_boosted = 0;
         dl_se->dl_throttled = 0;
         dl_se->dl_yielded = 0;
         dl_se->dl_non_contending = 0;
+1 -1
kernel/sched/fair.c
···
                 }
         }
 
-        sa->runnable_avg = cpu_scale;
+        sa->runnable_avg = sa->util_avg;
 
         if (p->sched_class != &fair_sched_class) {
                 /*
+1 -1
kernel/sched/sched.h
···
 #define WF_SYNC         0x01            /* Waker goes to sleep after wakeup */
 #define WF_FORK         0x02            /* Child wakeup after fork */
 #define WF_MIGRATED     0x04            /* Internal use, task got migrated */
-#define WF_ON_RQ        0x08            /* Wakee is on_rq */
+#define WF_ON_CPU       0x08            /* Wakee is on_cpu */
 
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
-18
kernel/smp.c
···
 {
         int num_nodes, num_cpus;
 
-        /*
-         * Ensure struct irq_work layout matches so that
-         * flush_smp_call_function_queue() can do horrible things.
-         */
-        BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
-                     offsetof(struct __call_single_data, llist));
-        BUILD_BUG_ON(offsetof(struct irq_work, func) !=
-                     offsetof(struct __call_single_data, func));
-        BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
-                     offsetof(struct __call_single_data, flags));
-
-        /*
-         * Assert the CSD_TYPE_TTWU layout is similar enough
-         * for task_struct to be on the @call_single_queue.
-         */
-        BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
-                     offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
-
         idle_threads_init();
         cpuhp_threads_init();
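
With wake_entry now a struct __call_single_node, and irq_work and __call_single_data built around the same header type, the layout compatibility these assertions used to check is guaranteed by construction, which is why the whole block can simply go.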