sched/mmcid: Provide precomputed maximal value

Reading mm::mm_users and mm::mm_cid::nr_cpus_allowed every time to compute
the maximal CID value is just wasteful as that value only changes on
fork(), exit() and possibly when the affinity changes.

So it can be easily precomputed at those points and provided in mm::mm_cid
for consumption in the hot path.

But there is an issue with using mm::mm_users for accounting because that
does not necessarily reflect the number of user space tasks as other kernel
code can take temporary references on the MM which skew the picture.

Solve that by adding a users counter to struct mm_mm_cid, which is modified
by fork() and exit() and used for precomputing under mm_mm_cid::lock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.832764634@linutronix.de

Thomas Gleixner 6 months ago b0c3d51b bf070520

+50 -19

4 changed files

expand all

include

linux

rseq_types.h

kernel

fork.c

sched

core.c

sched.h

include/linux/rseq_types.h

··· 117 117 /** 118 118 * struct mm_mm_cid - Storage for per MM CID data 119 119 * @pcpu: Per CPU storage for CIDs associated to a CPU 120 + * @max_cids: The exclusive maximum CID value for allocation and convergence 120 121 * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map 121 122 * is growth only. 123 + * @users: The number of tasks sharing this MM. Separate from mm::mm_users 124 + * as that is modified by mmget()/mmput() by other entities which 125 + * do not actually share the MM. 122 126 * @lock: Spinlock to protect all fields except @pcpu. It also protects 123 127 * the MM cid cpumask and the MM cidmask bitmap. 124 128 */ 125 129 struct mm_mm_cid { 126 130 struct mm_cid_pcpu __percpu *pcpu; 131 + unsigned int max_cids; 127 132 unsigned int nr_cpus_allowed; 133 + unsigned int users; 128 134 raw_spinlock_t lock; 129 135 }____cacheline_aligned_in_smp; 130 136 #else /* CONFIG_SCHED_MM_CID */

kernel/fork.c

··· 2455 2455 exit_task_namespaces(p); 2456 2456 bad_fork_cleanup_mm: 2457 2457 if (p->mm) { 2458 + sched_mm_cid_exit(p); 2458 2459 mm_clear_owner(p->mm, p); 2459 2460 mmput(p->mm); 2460 2461 }

+42 -17

kernel/sched/core.c

··· 4485 4485 init_numa_balancing(clone_flags, p); 4486 4486 p->wake_entry.u_flags = CSD_TYPE_TTWU; 4487 4487 p->migration_pending = NULL; 4488 - init_sched_mm_cid(p); 4489 4488 } 4490 4489 4491 4490 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); ··· 10370 10371 10371 10372 #ifdef CONFIG_SCHED_MM_CID 10372 10373 /* 10373 - * When a task exits, the MM CID held by the task is not longer required as 10374 - * the task cannot return to user space. 10374 + * Update the CID range properties when the constraints change. Invoked via 10375 + * fork(), exit() and affinity changes 10375 10376 */ 10377 + static void mm_update_max_cids(struct mm_struct *mm) 10378 + { 10379 + struct mm_mm_cid *mc = &mm->mm_cid; 10380 + unsigned int max_cids; 10381 + 10382 + lockdep_assert_held(&mm->mm_cid.lock); 10383 + 10384 + /* Calculate the new maximum constraint */ 10385 + max_cids = min(mc->nr_cpus_allowed, mc->users); 10386 + WRITE_ONCE(mc->max_cids, max_cids); 10387 + } 10388 + 10376 10389 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) 10377 10390 { 10378 10391 struct cpumask *mm_allowed; 10379 10392 unsigned int weight; 10380 10393 10381 - if (!mm) 10394 + if (!mm || !READ_ONCE(mm->mm_cid.users)) 10382 10395 return; 10383 10396 10384 10397 /* ··· 10400 10389 guard(raw_spinlock)(&mm->mm_cid.lock); 10401 10390 mm_allowed = mm_cpus_allowed(mm); 10402 10391 weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); 10392 + if (weight == mm->mm_cid.nr_cpus_allowed) 10393 + return; 10403 10394 WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight); 10395 + mm_update_max_cids(mm); 10404 10396 } 10405 10397 10398 + void sched_mm_cid_fork(struct task_struct *t) 10399 + { 10400 + struct mm_struct *mm = t->mm; 10401 + 10402 + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); 10403 + 10404 + guard(raw_spinlock)(&mm->mm_cid.lock); 10405 + t->mm_cid.active = 1; 10406 + mm->mm_cid.users++; 10407 + /* Preset last_cid for mm_cid_select() */ 10408 + 
t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1; 10409 + mm_update_max_cids(mm); 10410 + } 10411 + 10412 + /* 10413 + * When a task exits, the MM CID held by the task is no longer required as 10414 + * the task cannot return to user space. 10415 + */ 10406 10416 void sched_mm_cid_exit(struct task_struct *t) 10407 10417 { 10408 10418 struct mm_struct *mm = t->mm; ··· 10431 10399 if (!mm || !t->mm_cid.active) 10432 10400 return; 10433 10401 10434 - guard(preempt)(); 10402 + guard(raw_spinlock)(&mm->mm_cid.lock); 10435 10403 t->mm_cid.active = 0; 10404 + mm->mm_cid.users--; 10436 10405 if (t->mm_cid.cid != MM_CID_UNSET) { 10437 10406 clear_bit(t->mm_cid.cid, mm_cidmask(mm)); 10438 10407 t->mm_cid.cid = MM_CID_UNSET; 10439 10408 } 10409 + mm_update_max_cids(mm); 10440 10410 } 10441 10411 10442 10412 /* Deactivate MM CID allocation across execve() */ ··· 10450 10416 /* Reactivate MM CID after successful execve() */ 10451 10417 void sched_mm_cid_after_execve(struct task_struct *t) 10452 10418 { 10453 - struct mm_struct *mm = t->mm; 10454 - 10455 - if (!mm) 10456 - return; 10457 - 10419 + sched_mm_cid_fork(t); 10458 10420 guard(preempt)(); 10459 - t->mm_cid.active = 1; 10460 10421 mm_cid_select(t); 10461 - } 10462 - 10463 - void sched_mm_cid_fork(struct task_struct *t) 10464 - { 10465 - WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET); 10466 - t->mm_cid.active = 1; 10467 10422 } 10468 10423 10469 10424 void mm_init_cid(struct mm_struct *mm, struct task_struct *p) ··· 10463 10440 for_each_possible_cpu(cpu) 10464 10441 per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET; 10465 10442 10443 + mm->mm_cid.max_cids = 0; 10466 10444 mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; 10445 + mm->mm_cid.users = 0; 10467 10446 raw_spin_lock_init(&mm->mm_cid.lock); 10468 10447 cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 10469 10448 bitmap_zero(mm_cidmask(mm), num_possible_cpus());

+1 -2

kernel/sched/sched.h

··· 3571 3571 struct mm_struct *mm = t->mm; 3572 3572 unsigned int max_cids; 3573 3573 3574 - max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users)); 3574 + max_cids = READ_ONCE(mm->mm_cid.max_cids); 3575 3575 3576 3576 /* Try to reuse the last CID of this task */ 3577 3577 if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids)) ··· 3614 3614 } 3615 3615 3616 3616 #else /* !CONFIG_SCHED_MM_CID: */ 3617 - static inline void init_sched_mm_cid(struct task_struct *t) { } 3618 3617 static inline void mm_cid_select(struct task_struct *t) { } 3619 3618 static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } 3620 3619 #endif /* !CONFIG_SCHED_MM_CID */

Configure Feed

Configure Feed