Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

sched/mmcid: Implement deferred mode change

When affinity changes cause an increase in the number of CPUs allowed for
tasks which are related to an MM, that might result in a situation where
the ownership mode can go back from per CPU mode to per task mode.

As affinity changes happen with the runqueue lock held, there is no way to
do the actual mode change and the required fixup right there.

Add the infrastructure to defer it to a workqueue. The scheduled work can
race with a fork() or exit(). Whatever happens first takes care of it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.216484739@linutronix.de
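
As an aside, the two-stage deferral this patch introduces follows a common
kernel pattern: an irq_work is raised from the context that holds the
runqueue lock, its handler runs where schedule_work() is safe, and the work
item finally performs the sleepable fixup under a mutex. Below is a minimal,
self-contained sketch of that pattern; it is illustrative only, and the
demo_defer_* names are made up for this sketch rather than taken from the
patch.

#include <linux/container_of.h>
#include <linux/irq_work.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct demo_defer {
        raw_spinlock_t          lock;           /* nests inside rq::lock */
        bool                    pending;        /* deferred update requested */
        struct irq_work         irq_work;       /* stage 1: raised from rq::lock context */
        struct work_struct      work;           /* stage 2: sleepable workqueue context */
        struct mutex            mutex;          /* serializes the actual fixup */
};

/* Caller holds @d->lock (underneath the runqueue lock): only mark and poke */
static void demo_defer_request(struct demo_defer *d)
{
        if (d->pending)
                return;
        d->pending = true;
        irq_work_queue(&d->irq_work);
}

/* Stage 1: hard interrupt context, where scheduling work is allowed */
static void demo_defer_irq_work(struct irq_work *iw)
{
        struct demo_defer *d = container_of(iw, struct demo_defer, irq_work);

        schedule_work(&d->work);
}

/* Stage 2: workqueue context, may sleep and take the mutex */
static void demo_defer_work(struct work_struct *w)
{
        struct demo_defer *d = container_of(w, struct demo_defer, work);

        guard(mutex)(&d->mutex);
        scoped_guard(raw_spinlock_irq, &d->lock) {
                /* Another path (e.g. fork() or exit()) may have handled it first */
                if (!d->pending)
                        return;
                d->pending = false;
        }
        /* ... perform the deferred, sleepable fixup here ... */
}

static void demo_defer_init(struct demo_defer *d)
{
        raw_spin_lock_init(&d->lock);
        mutex_init(&d->mutex);
        d->pending = false;
        d->irq_work = IRQ_WORK_INIT_HARD(demo_defer_irq_work);
        INIT_WORK(&d->work, demo_defer_work);
}

IRQ_WORK_INIT_HARD() marks the item to run from hard interrupt context even
on PREEMPT_RT, which matches the initialization added to mm_init_cid() in
the diff below.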

2 files changed: +60 -8

include/linux/rseq_types.h (+8)
···
 #ifndef _LINUX_RSEQ_TYPES_H
 #define _LINUX_RSEQ_TYPES_H
 
+#include <linux/irq_work_types.h>
 #include <linux/types.h>
+#include <linux/workqueue_types.h>
 
 #ifdef CONFIG_RSEQ
 struct rseq;
···
  * @percpu:    Set, when CIDs are in per CPU mode
  * @transit:   Set to MM_CID_TRANSIT during a mode change transition phase
  * @max_cids:  The exclusive maximum CID value for allocation and convergence
+ * @irq_work:  irq_work to handle the affinity mode change case
+ * @work:      Regular work to handle the affinity mode change case
  * @lock:      Spinlock to protect against affinity setting which can't take @mutex
  * @mutex:     Mutex to serialize forks and exits related to this mm
  * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
···
         unsigned int            percpu;
         unsigned int            transit;
         unsigned int            max_cids;
+
+        /* Rarely used. Moves @lock and @mutex into the second cacheline */
+        struct irq_work         irq_work;
+        struct work_struct      work;
 
         raw_spinlock_t          lock;
         struct mutex            mutex;
kernel/sched/core.c (+52 -8)
···
 
         /* Adjust the threshold to the wider set */
         mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+        /* Switch back to per task mode? */
+        if (mc->users >= mc->pcpu_thrs)
+                return;
 
-        /* Scheduling of deferred mode switch goes here */
+        /* Don't queue twice */
+        if (mc->update_deferred)
+                return;
+
+        /* Queue the irq work, which schedules the real work */
+        mc->update_deferred = true;
+        irq_work_queue(&mc->irq_work);
 }
 
 static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
···
 }
 
-static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
 {
         unsigned int cpu;
 
···
         mm_cid_select(t);
 }
 
+static void mm_cid_work_fn(struct work_struct *work)
+{
+        struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
+
+        /* Make it compile, but not functional yet */
+        if (!IS_ENABLED(CONFIG_NEW_MM_CID))
+                return;
+
+        guard(mutex)(&mm->mm_cid.mutex);
+        /* Did the last user task exit already? */
+        if (!mm->mm_cid.users)
+                return;
+
+        scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+                /* Have fork() or exit() handled it already? */
+                if (!mm->mm_cid.update_deferred)
+                        return;
+                /* This clears mm_cid::update_deferred */
+                if (!mm_update_max_cids(mm))
+                        return;
+                /* Affinity changes can only switch back to task mode */
+                if (WARN_ON_ONCE(mm->mm_cid.percpu))
+                        return;
+        }
+        mm_cid_fixup_cpus_to_tasks(mm);
+}
+
+static void mm_cid_irq_work(struct irq_work *work)
+{
+        struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+
+        /*
+         * Needs to be unconditional because mm_cid::lock cannot be held
+         * when scheduling work as mm_update_cpus_allowed() nests inside
+         * rq::lock and schedule_work() might end up in wakeup...
+         */
+        schedule_work(&mm->mm_cid.work);
+}
+
 void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 {
-        struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
-        int cpu;
-
-        for_each_possible_cpu(cpu)
-                per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
-
         mm->mm_cid.max_cids = 0;
         mm->mm_cid.percpu = 0;
         mm->mm_cid.transit = 0;
···
         mm->mm_cid.update_deferred = 0;
         raw_spin_lock_init(&mm->mm_cid.lock);
         mutex_init(&mm->mm_cid.mutex);
+        mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+        INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
         cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
         bitmap_zero(mm_cidmask(mm), num_possible_cpus());
 }
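
For context, the situation described in the changelog can be provoked from
user space simply by widening a task's affinity mask after it has been
running on a narrower set: the wider mask raises the per-CPU threshold, and
if the MM's user count is now below it, ownership has to go back to per task
mode, which is exactly the case deferred above. An illustrative snippet (not
part of the patch, error handling trimmed):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        cpu_set_t set;
        long ncpus = sysconf(_SC_NPROCESSORS_ONLN);

        /* Confine the task to CPU 0 first ... */
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        if (sched_setaffinity(0, sizeof(set), &set))
                perror("sched_setaffinity (narrow)");

        /* ... then allow all online CPUs again. If the number of tasks
         * using this MM is now below the recalculated per-CPU threshold,
         * the kernel switches CID ownership back to per task mode; the
         * required fixup is what this patch defers to a workqueue. */
        CPU_ZERO(&set);
        for (long cpu = 0; cpu < ncpus; cpu++)
                CPU_SET(cpu, &set);
        if (sched_setaffinity(0, sizeof(set), &set))
                perror("sched_setaffinity (widen)");

        return 0;
}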