Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/mmcid: Avoid full tasklist walks

Chasing vfork()'ed tasks on a CID ownership mode switch requires a full
task list walk, which is obviously expensive on large systems.

Avoid that by keeping a list of the tasks using an mm's MMCID entity in
mm::mm_cid and walking that list instead. This removes the counting logic,
which has proven to be flaky, and avoids a full task list walk in the case
of vfork()'ed tasks.

Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260310202526.183824481@kernel.org

authored by

Thomas Gleixner and committed by
Peter Zijlstra
192d8521 7574ac6e

+19 -44
+5 -1
include/linux/rseq_types.h
··· 133 133 * @active: MM CID is active for the task 134 134 * @cid: The CID associated to the task either permanently or 135 135 * borrowed from the CPU 136 + * @node: Queued in the per MM MMCID list 136 137 */ 137 138 struct sched_mm_cid { 138 139 unsigned int active; 139 140 unsigned int cid; 141 + struct hlist_node node; 140 142 }; 141 143 142 144 /** ··· 159 157 * @work: Regular work to handle the affinity mode change case 160 158 * @lock: Spinlock to protect against affinity setting which can't take @mutex 161 159 * @mutex: Mutex to serialize forks and exits related to this mm 160 + * @user_list: List of the MM CID users of a MM 162 161 * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map 163 162 * is growth only. 164 163 * @users: The number of tasks sharing this MM. Separate from mm::mm_users ··· 180 177 181 178 raw_spinlock_t lock; 182 179 struct mutex mutex; 180 + struct hlist_head user_list; 183 181 184 182 /* Low frequency modified */ 185 183 unsigned int nr_cpus_allowed; 186 184 unsigned int users; 187 185 unsigned int pcpu_thrs; 188 186 unsigned int update_deferred; 189 - }____cacheline_aligned_in_smp; 187 + } ____cacheline_aligned; 190 188 #else /* CONFIG_SCHED_MM_CID */ 191 189 struct mm_mm_cid { }; 192 190 struct sched_mm_cid { };
+1
kernel/fork.c
··· 1000 1000 #ifdef CONFIG_SCHED_MM_CID 1001 1001 tsk->mm_cid.cid = MM_CID_UNSET; 1002 1002 tsk->mm_cid.active = 0; 1003 + INIT_HLIST_NODE(&tsk->mm_cid.node); 1003 1004 #endif 1004 1005 return tsk; 1005 1006
+13 -43
kernel/sched/core.c
··· 10620 10620 } 10621 10621 } 10622 10622 10623 - static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) 10623 + static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) 10624 10624 { 10625 10625 /* Remote access to mm::mm_cid::pcpu requires rq_lock */ 10626 10626 guard(task_rq_lock)(t); 10627 - /* If the task is not active it is not in the users count */ 10628 - if (!t->mm_cid.active) 10629 - return false; 10630 10627 if (cid_on_task(t->mm_cid.cid)) { 10631 10628 /* If running on the CPU, put the CID in transit mode, otherwise drop it */ 10632 10629 if (task_rq(t)->curr == t) ··· 10631 10634 else 10632 10635 mm_unset_cid_on_task(t); 10633 10636 } 10634 - return true; 10635 - } 10636 - 10637 - static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm) 10638 - { 10639 - struct task_struct *p, *t; 10640 - unsigned int users; 10641 - 10642 - /* 10643 - * This can obviously race with a concurrent affinity change, which 10644 - * increases the number of allowed CPUs for this mm, but that does 10645 - * not affect the mode and only changes the CID constraints. A 10646 - * possible switch back to per task mode happens either in the 10647 - * deferred handler function or in the next fork()/exit(). 10648 - * 10649 - * The caller has already transferred so remove it from the users 10650 - * count. The incoming task is already visible and has mm_cid.active, 10651 - * but has task::mm_cid::cid == UNSET. Still it needs to be accounted 10652 - * for. Concurrent fork()s might add more threads, but all of them have 10653 - * task::mm_cid::active = 0, so they don't affect the accounting here. 10654 - */ 10655 - users = mm->mm_cid.users - 1; 10656 - 10657 - guard(rcu)(); 10658 - for_other_threads(current, t) { 10659 - if (mm_cid_fixup_task_to_cpu(t, mm)) 10660 - users--; 10661 - } 10662 - 10663 - if (!users) 10664 - return; 10665 - 10666 - /* Happens only for VM_CLONE processes. 
*/ 10667 - for_each_process_thread(p, t) { 10668 - if (t == current || t->mm != mm) 10669 - continue; 10670 - mm_cid_fixup_task_to_cpu(t, mm); 10671 - } 10672 10637 } 10673 10638 10674 10639 static void mm_cid_fixup_tasks_to_cpus(void) 10675 10640 { 10676 10641 struct mm_struct *mm = current->mm; 10642 + struct task_struct *t; 10677 10643 10678 - mm_cid_do_fixup_tasks_to_cpus(mm); 10644 + lockdep_assert_held(&mm->mm_cid.mutex); 10645 + 10646 + hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) { 10647 + /* Current has already transferred before invoking the fixup. */ 10648 + if (t != current) 10649 + mm_cid_fixup_task_to_cpu(t, mm); 10650 + } 10651 + 10679 10652 mm_cid_complete_transit(mm, MM_CID_ONCPU); 10680 10653 } 10681 10654 ··· 10654 10687 lockdep_assert_held(&mm->mm_cid.lock); 10655 10688 10656 10689 t->mm_cid.active = 1; 10690 + hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list); 10657 10691 mm->mm_cid.users++; 10658 10692 return mm_update_max_cids(mm); 10659 10693 } ··· 10712 10744 /* Clear the transition bit */ 10713 10745 t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); 10714 10746 mm_unset_cid_on_task(t); 10747 + hlist_del_init(&t->mm_cid.node); 10715 10748 t->mm->mm_cid.users--; 10716 10749 return mm_update_max_cids(t->mm); 10717 10750 } ··· 10855 10886 mutex_init(&mm->mm_cid.mutex); 10856 10887 mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); 10857 10888 INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); 10889 + INIT_HLIST_HEAD(&mm->mm_cid.user_list); 10858 10890 cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 10859 10891 bitmap_zero(mm_cidmask(mm), num_possible_cpus()); 10860 10892 }