Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git


Merge tag 'sched-urgent-2026-03-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
"More MM-CID fixes, mostly fixing hangs/races:

- Fix CID hangs due to a race between concurrent forks

- Fix vfork()/CLONE_VM MMCID bug causing hangs

- Remove pointless preemption guard

- Fix CID task list walk performance regression on large systems
by removing the known-flaky and slow counting logic using
for_each_process_thread() in mm_cid_*fixup_tasks_to_cpus(), and
implementing a simple sched_mm_cid::node list instead"

* tag 'sched-urgent-2026-03-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/mmcid: Avoid full tasklist walks
sched/mmcid: Remove pointless preempt guard
sched/mmcid: Handle vfork()/CLONE_VM correctly
sched/mmcid: Prevent CID stalls due to concurrent forks
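
The last item above is easy to picture outside the scheduler code: rather than scanning every task in the system with for_each_process_thread() and filtering by mm, each task carries an embedded list node and is added to a per-MM list at fork time, so the fixup walk only visits the tasks that actually share the MM. Below is a minimal userspace sketch of that pattern; every name in it (fake_mm, fake_task, mm_add_user, mm_fixup_users) is invented for illustration, and the real kernel code in the diffs further down uses hlist_head/hlist_node under the mm_cid locking.

/*
 * Minimal userspace sketch of the per-MM user list pattern described in the
 * pull message.  All names here are invented; the kernel variant uses
 * hlist_head/hlist_node and the mm_cid mutex/spinlock shown in the diffs.
 */
#include <stdio.h>

struct fake_task {
	int pid;
	struct fake_task *node;      /* embedded link into the owning mm's user list */
};

struct fake_mm {
	struct fake_task *user_list; /* head of the list of tasks sharing this mm */
	unsigned int users;
};

/* fork(): register the new task on its mm's list -- O(1) */
static void mm_add_user(struct fake_mm *mm, struct fake_task *t)
{
	t->node = mm->user_list;
	mm->user_list = t;
	mm->users++;
}

/*
 * Fixup walk: visit only the tasks that actually share this mm -- O(users),
 * instead of scanning every task in the system and filtering by mm.
 */
static void mm_fixup_users(struct fake_mm *mm)
{
	for (struct fake_task *t = mm->user_list; t; t = t->node)
		printf("fixup pid %d\n", t->pid);
}

int main(void)
{
	struct fake_mm mm = { 0 };
	struct fake_task a = { .pid = 1 }, b = { .pid = 2 }, c = { .pid = 3 };

	mm_add_user(&mm, &a);
	mm_add_user(&mm, &b);
	mm_add_user(&mm, &c);
	mm_fixup_users(&mm);    /* prints pids 3, 2, 1 */
	return 0;
}

The point of the pattern is that the walk cost becomes proportional to the number of tasks sharing the MM rather than to the number of tasks on the system, which is exactly the regression called out for large machines.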

+35 -57

include/linux/rseq_types.h (+5 -1)
···
  * @active:	MM CID is active for the task
  * @cid:	The CID associated to the task either permanently or
  *		borrowed from the CPU
+ * @node:	Queued in the per MM MMCID list
  */
 struct sched_mm_cid {
 	unsigned int		active;
 	unsigned int		cid;
+	struct hlist_node	node;
 };

 /**
···
  * @work:		Regular work to handle the affinity mode change case
  * @lock:		Spinlock to protect against affinity setting which can't take @mutex
  * @mutex:		Mutex to serialize forks and exits related to this mm
+ * @user_list:		List of the MM CID users of a MM
  * @nr_cpus_allowed:	The number of CPUs in the per MM allowed CPUs map. The map
  *			is growth only.
  * @users:		The number of tasks sharing this MM. Separate from mm::mm_users
···

 	raw_spinlock_t		lock;
 	struct mutex		mutex;
+	struct hlist_head	user_list;

 	/* Low frequency modified */
 	unsigned int		nr_cpus_allowed;
 	unsigned int		users;
 	unsigned int		pcpu_thrs;
 	unsigned int		update_deferred;
-} ____cacheline_aligned_in_smp;
+} ____cacheline_aligned;
 #else /* CONFIG_SCHED_MM_CID */
 struct mm_mm_cid { };
 struct sched_mm_cid { };
include/linux/sched.h (-2)
···
 #ifdef CONFIG_SCHED_MM_CID
 void sched_mm_cid_before_execve(struct task_struct *t);
 void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
 void sched_mm_cid_exit(struct task_struct *t);
 static __always_inline int task_mm_cid(struct task_struct *t)
 {
···
 #else
 static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
 static inline void sched_mm_cid_exit(struct task_struct *t) { }
 static __always_inline int task_mm_cid(struct task_struct *t)
 {
kernel/fork.c (+1 -2)
···
 #ifdef CONFIG_SCHED_MM_CID
 	tsk->mm_cid.cid = MM_CID_UNSET;
 	tsk->mm_cid.active = 0;
+	INIT_HLIST_NODE(&tsk->mm_cid.node);
 #endif
 	return tsk;

···

 	tsk->mm = mm;
 	tsk->active_mm = mm;
-	sched_mm_cid_fork(tsk);
 	return 0;
 }

···
 	exit_nsproxy_namespaces(p);
 bad_fork_cleanup_mm:
 	if (p->mm) {
-		sched_mm_cid_exit(p);
 		mm_clear_owner(p->mm, p);
 		mmput(p->mm);
 	}
kernel/sched/core.c (+29 -52)
···
 	scx_cancel_fork(p);
 }

+static void sched_mm_cid_fork(struct task_struct *t);
+
 void sched_post_fork(struct task_struct *p)
 {
+	sched_mm_cid_fork(p);
 	uclamp_post_fork(p);
 	scx_post_fork(p);
 }
···
 	}
 }

-static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
 {
 	/* Remote access to mm::mm_cid::pcpu requires rq_lock */
 	guard(task_rq_lock)(t);
-	/* If the task is not active it is not in the users count */
-	if (!t->mm_cid.active)
-		return false;
 	if (cid_on_task(t->mm_cid.cid)) {
 		/* If running on the CPU, put the CID in transit mode, otherwise drop it */
 		if (task_rq(t)->curr == t)
···
 		else
 			mm_unset_cid_on_task(t);
 	}
-	return true;
-}
-
-static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
-{
-	struct task_struct *p, *t;
-	unsigned int users;
-
-	/*
-	 * This can obviously race with a concurrent affinity change, which
-	 * increases the number of allowed CPUs for this mm, but that does
-	 * not affect the mode and only changes the CID constraints. A
-	 * possible switch back to per task mode happens either in the
-	 * deferred handler function or in the next fork()/exit().
-	 *
-	 * The caller has already transferred. The newly incoming task is
-	 * already accounted for, but not yet visible.
-	 */
-	users = mm->mm_cid.users - 2;
-	if (!users)
-		return;
-
-	guard(rcu)();
-	for_other_threads(current, t) {
-		if (mm_cid_fixup_task_to_cpu(t, mm))
-			users--;
-	}
-
-	if (!users)
-		return;
-
-	/* Happens only for VM_CLONE processes. */
-	for_each_process_thread(p, t) {
-		if (t == current || t->mm != mm)
-			continue;
-		if (mm_cid_fixup_task_to_cpu(t, mm)) {
-			if (--users == 0)
-				return;
-		}
-	}
 }

 static void mm_cid_fixup_tasks_to_cpus(void)
 {
 	struct mm_struct *mm = current->mm;
+	struct task_struct *t;

-	mm_cid_do_fixup_tasks_to_cpus(mm);
+	lockdep_assert_held(&mm->mm_cid.mutex);
+
+	hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
+		/* Current has already transferred before invoking the fixup. */
+		if (t != current)
+			mm_cid_fixup_task_to_cpu(t, mm);
+	}
+
 	mm_cid_complete_transit(mm, MM_CID_ONCPU);
 }

 static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
 {
+	lockdep_assert_held(&mm->mm_cid.lock);
+
 	t->mm_cid.active = 1;
+	hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
 	mm->mm_cid.users++;
 	return mm_update_max_cids(mm);
 }

-void sched_mm_cid_fork(struct task_struct *t)
+static void sched_mm_cid_fork(struct task_struct *t)
 {
 	struct mm_struct *mm = t->mm;
 	bool percpu;

-	WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+	if (!mm)
+		return;
+
+	WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);

 	guard(mutex)(&mm->mm_cid.mutex);
 	scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
···

 static bool sched_mm_cid_remove_user(struct task_struct *t)
 {
+	lockdep_assert_held(&t->mm->mm_cid.lock);
+
 	t->mm_cid.active = 0;
-	scoped_guard(preempt) {
-		/* Clear the transition bit */
-		t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
-		mm_unset_cid_on_task(t);
-	}
+	/* Clear the transition bit */
+	t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+	mm_unset_cid_on_task(t);
+	hlist_del_init(&t->mm_cid.node);
 	t->mm->mm_cid.users--;
 	return mm_update_max_cids(t->mm);
 }
···
 	mutex_init(&mm->mm_cid.mutex);
 	mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
 	INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+	INIT_HLIST_HEAD(&mm->mm_cid.user_list);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	bitmap_zero(mm_cidmask(mm), num_possible_cpus());
 }
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
 #endif /* !CONFIG_SCHED_MM_CID */

 static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
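
One detail of the new mm_cid_fixup_tasks_to_cpus() walk above: the third argument of hlist_for_each_entry() names the embedded member, and a nested path such as mm_cid.node works because the iterator recovers the containing task_struct by subtracting the member's offset. A small userspace sketch of that container_of() arithmetic follows; the type names (list_node, cid_state, fake_task) are invented for the example.

/*
 * Userspace sketch of how an iteration like
 *     hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node)
 * recovers the containing task from the embedded node: container_of()
 * subtracts the offset of the (possibly nested) member.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct list_node { struct list_node *next; };

struct cid_state {
	unsigned int cid;
	struct list_node node;   /* embedded membership link */
};

struct fake_task {
	int pid;
	struct cid_state mm_cid; /* the node sits one level down */
};

int main(void)
{
	struct fake_task t = { .pid = 42, .mm_cid = { .cid = 3 } };
	struct list_node *n = &t.mm_cid.node;

	/* Walk back from the embedded node to the owning task */
	struct fake_task *owner = container_of(n, struct fake_task, mm_cid.node);
	printf("pid %d cid %u\n", owner->pid, owner->mm_cid.cid);
	return 0;
}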