Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/mmcid: Revert the complex CID management

The CID management is a complex beast, which affects both scheduling and
task migration. The compaction mechanism forces random tasks of a process
into task work on exit to user space, causing latency spikes.

Revert to the initial simple bitmap allocation mechanics. They are known to
have scalability issues, but going back to them allows replacement
functionality to be built up gradually in a reviewable way.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172549.068197830@linutronix.de

authored by

Thomas Gleixner and committed by
Peter Zijlstra
77d7dc8b 80adaccf

+66 -802
+2 -51
include/linux/mm_types.h
··· 922 922 #define vma_policy(vma) NULL 923 923 #endif 924 924 925 - #ifdef CONFIG_SCHED_MM_CID 926 925 struct mm_cid { 927 - u64 time; 928 - int cid; 929 - int recent_cid; 926 + unsigned int cid; 930 927 }; 931 - #endif 932 928 933 929 /* 934 930 * Opaque type representing current mm_struct flag state. Must be accessed via ··· 996 1000 * runqueue locks. 997 1001 */ 998 1002 struct mm_cid __percpu *pcpu_cid; 999 - /* 1000 - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). 1001 - * 1002 - * When the next mm_cid scan is due (in jiffies). 1003 - */ 1004 - unsigned long mm_cid_next_scan; 1005 1003 /** 1006 1004 * @nr_cpus_allowed: Number of CPUs allowed for mm. 1007 1005 * ··· 1003 1013 * threads allowed CPUs. 1004 1014 */ 1005 1015 unsigned int nr_cpus_allowed; 1006 - /** 1007 - * @max_nr_cid: Maximum number of allowed concurrency 1008 - * IDs allocated. 1009 - * 1010 - * Track the highest number of allowed concurrency IDs 1011 - * allocated for the mm. 1012 - */ 1013 - atomic_t max_nr_cid; 1014 1016 /** 1015 1017 * @cpus_allowed_lock: Lock protecting mm cpus_allowed. 1016 1018 * ··· 1353 1371 1354 1372 #ifdef CONFIG_SCHED_MM_CID 1355 1373 1356 - enum mm_cid_state { 1357 - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. 
*/ 1358 - MM_CID_LAZY_PUT = (1U << 31), 1359 - }; 1360 - 1361 - static inline bool mm_cid_is_unset(int cid) 1362 - { 1363 - return cid == MM_CID_UNSET; 1364 - } 1365 - 1366 - static inline bool mm_cid_is_lazy_put(int cid) 1367 - { 1368 - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); 1369 - } 1370 - 1371 - static inline bool mm_cid_is_valid(int cid) 1372 - { 1373 - return !(cid & MM_CID_LAZY_PUT); 1374 - } 1375 - 1376 - static inline int mm_cid_set_lazy_put(int cid) 1377 - { 1378 - return cid | MM_CID_LAZY_PUT; 1379 - } 1380 - 1381 - static inline int mm_cid_clear_lazy_put(int cid) 1382 - { 1383 - return cid & ~MM_CID_LAZY_PUT; 1384 - } 1374 + #define MM_CID_UNSET (~0U) 1385 1375 1386 1376 /* 1387 1377 * mm_cpus_allowed: Union of all mm's threads allowed CPUs. ··· 1386 1432 struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); 1387 1433 1388 1434 pcpu_cid->cid = MM_CID_UNSET; 1389 - pcpu_cid->recent_cid = MM_CID_UNSET; 1390 - pcpu_cid->time = 0; 1391 1435 } 1392 1436 mm->nr_cpus_allowed = p->nr_cpus_allowed; 1393 - atomic_set(&mm->max_nr_cid, 0); 1394 1437 raw_spin_lock_init(&mm->cpus_allowed_lock); 1395 1438 cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 1396 1439 cpumask_clear(mm_cidmask(mm));
+2 -3
kernel/fork.c
··· 955 955 #endif 956 956 957 957 #ifdef CONFIG_SCHED_MM_CID 958 - tsk->mm_cid = -1; 959 - tsk->last_mm_cid = -1; 958 + tsk->mm_cid = MM_CID_UNSET; 959 + tsk->last_mm_cid = MM_CID_UNSET; 960 960 tsk->mm_cid_active = 0; 961 - tsk->migrate_from_cpu = -1; 962 961 #endif 963 962 return tsk; 964 963
+17 -500
kernel/sched/core.c
··· 2128 2128 { 2129 2129 if (task_on_rq_migrating(p)) 2130 2130 flags |= ENQUEUE_MIGRATED; 2131 - if (flags & ENQUEUE_MIGRATED) 2132 - sched_mm_cid_migrate_to(rq, p); 2133 2131 2134 2132 enqueue_task(rq, p, flags); 2135 2133 ··· 3327 3329 if (p->sched_class->migrate_task_rq) 3328 3330 p->sched_class->migrate_task_rq(p, new_cpu); 3329 3331 p->se.nr_migrations++; 3330 - sched_mm_cid_migrate_from(p); 3331 3332 perf_event_task_migrate(p); 3332 3333 } 3333 3334 ··· 5277 5280 * 5278 5281 * kernel -> user switch + mmdrop_lazy_tlb() active 5279 5282 * user -> user switch 5280 - * 5281 - * switch_mm_cid() needs to be updated if the barriers provided 5282 - * by context_switch() are modified. 5283 5283 */ 5284 5284 if (!next->mm) { // to kernel 5285 5285 enter_lazy_tlb(prev->active_mm, next); ··· 5306 5312 } 5307 5313 } 5308 5314 5309 - /* switch_mm_cid() requires the memory barriers above. */ 5310 - switch_mm_cid(rq, prev, next); 5315 + switch_mm_cid(prev, next); 5311 5316 5312 5317 /* 5313 5318 * Tell rseq that the task was scheduled in. Must be after ··· 5597 5604 resched_latency = cpu_resched_latency(rq); 5598 5605 calc_global_load_tick(rq); 5599 5606 sched_core_tick(rq); 5600 - task_tick_mm_cid(rq, donor); 5601 5607 scx_tick(rq); 5602 5608 5603 5609 rq_unlock(rq, &rf); ··· 10368 10376 } 10369 10377 10370 10378 #ifdef CONFIG_SCHED_MM_CID 10371 - 10372 10379 /* 10373 - * @cid_lock: Guarantee forward-progress of cid allocation. 10374 - * 10375 - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock 10376 - * is only used when contention is detected by the lock-free allocation so 10377 - * forward progress can be guaranteed. 10380 + * When a task exits, the MM CID held by the task is not longer required as 10381 + * the task cannot return to user space. 10378 10382 */ 10379 - DEFINE_RAW_SPINLOCK(cid_lock); 10380 - 10381 - /* 10382 - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. 
10383 - * 10384 - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is 10385 - * detected, it is set to 1 to ensure that all newly coming allocations are 10386 - * serialized by @cid_lock until the allocation which detected contention 10387 - * completes and sets @use_cid_lock back to 0. This guarantees forward progress 10388 - * of a cid allocation. 10389 - */ 10390 - int use_cid_lock; 10391 - 10392 - /* 10393 - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid 10394 - * concurrently with respect to the execution of the source runqueue context 10395 - * switch. 10396 - * 10397 - * There is one basic properties we want to guarantee here: 10398 - * 10399 - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively 10400 - * used by a task. That would lead to concurrent allocation of the cid and 10401 - * userspace corruption. 10402 - * 10403 - * Provide this guarantee by introducing a Dekker memory ordering to guarantee 10404 - * that a pair of loads observe at least one of a pair of stores, which can be 10405 - * shown as: 10406 - * 10407 - * X = Y = 0 10408 - * 10409 - * w[X]=1 w[Y]=1 10410 - * MB MB 10411 - * r[Y]=y r[X]=x 10412 - * 10413 - * Which guarantees that x==0 && y==0 is impossible. But rather than using 10414 - * values 0 and 1, this algorithm cares about specific state transitions of the 10415 - * runqueue current task (as updated by the scheduler context switch), and the 10416 - * per-mm/cpu cid value. 10417 - * 10418 - * Let's introduce task (Y) which has task->mm == mm and task (N) which has 10419 - * task->mm != mm for the rest of the discussion. 
There are two scheduler state 10420 - * transitions on context switch we care about: 10421 - * 10422 - * (TSA) Store to rq->curr with transition from (N) to (Y) 10423 - * 10424 - * (TSB) Store to rq->curr with transition from (Y) to (N) 10425 - * 10426 - * On the remote-clear side, there is one transition we care about: 10427 - * 10428 - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag 10429 - * 10430 - * There is also a transition to UNSET state which can be performed from all 10431 - * sides (scheduler, remote-clear). It is always performed with a cmpxchg which 10432 - * guarantees that only a single thread will succeed: 10433 - * 10434 - * (TMB) cmpxchg to *pcpu_cid to mark UNSET 10435 - * 10436 - * Just to be clear, what we do _not_ want to happen is a transition to UNSET 10437 - * when a thread is actively using the cid (property (1)). 10438 - * 10439 - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. 10440 - * 10441 - * Scenario A) (TSA)+(TMA) (from next task perspective) 10442 - * 10443 - * CPU0 CPU1 10444 - * 10445 - * Context switch CS-1 Remote-clear 10446 - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) 10447 - * (implied barrier after cmpxchg) 10448 - * - switch_mm_cid() 10449 - * - memory barrier (see switch_mm_cid() 10450 - * comment explaining how this barrier 10451 - * is combined with other scheduler 10452 - * barriers) 10453 - * - mm_cid_get (next) 10454 - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) 10455 - * 10456 - * This Dekker ensures that either task (Y) is observed by the 10457 - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are 10458 - * observed. 10459 - * 10460 - * If task (Y) store is observed by rcu_dereference(), it means that there is 10461 - * still an active task on the cpu. Remote-clear will therefore not transition 10462 - * to UNSET, which fulfills property (1). 
10463 - * 10464 - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), 10465 - * it will move its state to UNSET, which clears the percpu cid perhaps 10466 - * uselessly (which is not an issue for correctness). Because task (Y) is not 10467 - * observed, CPU1 can move ahead to set the state to UNSET. Because moving 10468 - * state to UNSET is done with a cmpxchg expecting that the old state has the 10469 - * LAZY flag set, only one thread will successfully UNSET. 10470 - * 10471 - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 10472 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and 10473 - * CPU1 will observe task (Y) and do nothing more, which is fine. 10474 - * 10475 - * What we are effectively preventing with this Dekker is a scenario where 10476 - * neither LAZY flag nor store (Y) are observed, which would fail property (1) 10477 - * because this would UNSET a cid which is actively used. 10478 - */ 10479 - 10480 - void sched_mm_cid_migrate_from(struct task_struct *t) 10481 - { 10482 - t->migrate_from_cpu = task_cpu(t); 10483 - } 10484 - 10485 - static 10486 - int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, 10487 - struct task_struct *t, 10488 - struct mm_cid *src_pcpu_cid) 10489 - { 10490 - struct mm_struct *mm = t->mm; 10491 - struct task_struct *src_task; 10492 - int src_cid, last_mm_cid; 10493 - 10494 - if (!mm) 10495 - return -1; 10496 - 10497 - last_mm_cid = t->last_mm_cid; 10498 - /* 10499 - * If the migrated task has no last cid, or if the current 10500 - * task on src rq uses the cid, it means the source cid does not need 10501 - * to be moved to the destination cpu. 
10502 - */ 10503 - if (last_mm_cid == -1) 10504 - return -1; 10505 - src_cid = READ_ONCE(src_pcpu_cid->cid); 10506 - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) 10507 - return -1; 10508 - 10509 - /* 10510 - * If we observe an active task using the mm on this rq, it means we 10511 - * are not the last task to be migrated from this cpu for this mm, so 10512 - * there is no need to move src_cid to the destination cpu. 10513 - */ 10514 - guard(rcu)(); 10515 - src_task = rcu_dereference(src_rq->curr); 10516 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10517 - t->last_mm_cid = -1; 10518 - return -1; 10519 - } 10520 - 10521 - return src_cid; 10522 - } 10523 - 10524 - static 10525 - int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, 10526 - struct task_struct *t, 10527 - struct mm_cid *src_pcpu_cid, 10528 - int src_cid) 10529 - { 10530 - struct task_struct *src_task; 10531 - struct mm_struct *mm = t->mm; 10532 - int lazy_cid; 10533 - 10534 - if (src_cid == -1) 10535 - return -1; 10536 - 10537 - /* 10538 - * Attempt to clear the source cpu cid to move it to the destination 10539 - * cpu. 10540 - */ 10541 - lazy_cid = mm_cid_set_lazy_put(src_cid); 10542 - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) 10543 - return -1; 10544 - 10545 - /* 10546 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10547 - * rq->curr->mm matches the scheduler barrier in context_switch() 10548 - * between store to rq->curr and load of prev and next task's 10549 - * per-mm/cpu cid. 10550 - * 10551 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10552 - * rq->curr->mm_cid_active matches the barrier in 10553 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10554 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10555 - * load of per-mm/cpu cid. 
10556 - */ 10557 - 10558 - /* 10559 - * If we observe an active task using the mm on this rq after setting 10560 - * the lazy-put flag, this task will be responsible for transitioning 10561 - * from lazy-put flag set to MM_CID_UNSET. 10562 - */ 10563 - scoped_guard (rcu) { 10564 - src_task = rcu_dereference(src_rq->curr); 10565 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10566 - /* 10567 - * We observed an active task for this mm, there is therefore 10568 - * no point in moving this cid to the destination cpu. 10569 - */ 10570 - t->last_mm_cid = -1; 10571 - return -1; 10572 - } 10573 - } 10574 - 10575 - /* 10576 - * The src_cid is unused, so it can be unset. 10577 - */ 10578 - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10579 - return -1; 10580 - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); 10581 - return src_cid; 10582 - } 10583 - 10584 - /* 10585 - * Migration to dst cpu. Called with dst_rq lock held. 10586 - * Interrupts are disabled, which keeps the window of cid ownership without the 10587 - * source rq lock held small. 10588 - */ 10589 - void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) 10590 - { 10591 - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; 10592 - struct mm_struct *mm = t->mm; 10593 - int src_cid, src_cpu; 10594 - bool dst_cid_is_set; 10595 - struct rq *src_rq; 10596 - 10597 - lockdep_assert_rq_held(dst_rq); 10598 - 10599 - if (!mm) 10600 - return; 10601 - src_cpu = t->migrate_from_cpu; 10602 - if (src_cpu == -1) { 10603 - t->last_mm_cid = -1; 10604 - return; 10605 - } 10606 - /* 10607 - * Move the src cid if the dst cid is unset. This keeps id 10608 - * allocation closest to 0 in cases where few threads migrate around 10609 - * many CPUs. 10610 - * 10611 - * If destination cid or recent cid is already set, we may have 10612 - * to just clear the src cid to ensure compactness in frequent 10613 - * migrations scenarios. 
10614 - * 10615 - * It is not useful to clear the src cid when the number of threads is 10616 - * greater or equal to the number of allowed CPUs, because user-space 10617 - * can expect that the number of allowed cids can reach the number of 10618 - * allowed CPUs. 10619 - */ 10620 - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); 10621 - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || 10622 - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); 10623 - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) 10624 - return; 10625 - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); 10626 - src_rq = cpu_rq(src_cpu); 10627 - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); 10628 - if (src_cid == -1) 10629 - return; 10630 - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, 10631 - src_cid); 10632 - if (src_cid == -1) 10633 - return; 10634 - if (dst_cid_is_set) { 10635 - __mm_cid_put(mm, src_cid); 10636 - return; 10637 - } 10638 - /* Move src_cid to dst cpu. */ 10639 - mm_cid_snapshot_time(dst_rq, mm); 10640 - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); 10641 - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); 10642 - } 10643 - 10644 - static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, 10645 - int cpu) 10646 - { 10647 - struct rq *rq = cpu_rq(cpu); 10648 - struct task_struct *t; 10649 - int cid, lazy_cid; 10650 - 10651 - cid = READ_ONCE(pcpu_cid->cid); 10652 - if (!mm_cid_is_valid(cid)) 10653 - return; 10654 - 10655 - /* 10656 - * Clear the cpu cid if it is set to keep cid allocation compact. If 10657 - * there happens to be other tasks left on the source cpu using this 10658 - * mm, the next task using this mm will reallocate its cid on context 10659 - * switch. 
10660 - */ 10661 - lazy_cid = mm_cid_set_lazy_put(cid); 10662 - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) 10663 - return; 10664 - 10665 - /* 10666 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10667 - * rq->curr->mm matches the scheduler barrier in context_switch() 10668 - * between store to rq->curr and load of prev and next task's 10669 - * per-mm/cpu cid. 10670 - * 10671 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10672 - * rq->curr->mm_cid_active matches the barrier in 10673 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10674 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10675 - * load of per-mm/cpu cid. 10676 - */ 10677 - 10678 - /* 10679 - * If we observe an active task using the mm on this rq after setting 10680 - * the lazy-put flag, that task will be responsible for transitioning 10681 - * from lazy-put flag set to MM_CID_UNSET. 10682 - */ 10683 - scoped_guard (rcu) { 10684 - t = rcu_dereference(rq->curr); 10685 - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) 10686 - return; 10687 - } 10688 - 10689 - /* 10690 - * The cid is unused, so it can be unset. 10691 - * Disable interrupts to keep the window of cid ownership without rq 10692 - * lock small. 10693 - */ 10694 - scoped_guard (irqsave) { 10695 - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10696 - __mm_cid_put(mm, cid); 10697 - } 10698 - } 10699 - 10700 - static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) 10701 - { 10702 - struct rq *rq = cpu_rq(cpu); 10703 - struct mm_cid *pcpu_cid; 10704 - struct task_struct *curr; 10705 - u64 rq_clock; 10706 - 10707 - /* 10708 - * rq->clock load is racy on 32-bit but one spurious clear once in a 10709 - * while is irrelevant. 
10710 - */ 10711 - rq_clock = READ_ONCE(rq->clock); 10712 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10713 - 10714 - /* 10715 - * In order to take care of infrequently scheduled tasks, bump the time 10716 - * snapshot associated with this cid if an active task using the mm is 10717 - * observed on this rq. 10718 - */ 10719 - scoped_guard (rcu) { 10720 - curr = rcu_dereference(rq->curr); 10721 - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { 10722 - WRITE_ONCE(pcpu_cid->time, rq_clock); 10723 - return; 10724 - } 10725 - } 10726 - 10727 - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) 10728 - return; 10729 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10730 - } 10731 - 10732 - static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, 10733 - int weight) 10734 - { 10735 - struct mm_cid *pcpu_cid; 10736 - int cid; 10737 - 10738 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10739 - cid = READ_ONCE(pcpu_cid->cid); 10740 - if (!mm_cid_is_valid(cid) || cid < weight) 10741 - return; 10742 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10743 - } 10744 - 10745 - static void task_mm_cid_work(struct callback_head *work) 10746 - { 10747 - unsigned long now = jiffies, old_scan, next_scan; 10748 - struct task_struct *t = current; 10749 - struct cpumask *cidmask; 10750 - struct mm_struct *mm; 10751 - int weight, cpu; 10752 - 10753 - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); 10754 - 10755 - work->next = work; /* Prevent double-add */ 10756 - if (t->flags & PF_EXITING) 10757 - return; 10758 - mm = t->mm; 10759 - if (!mm) 10760 - return; 10761 - old_scan = READ_ONCE(mm->mm_cid_next_scan); 10762 - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10763 - if (!old_scan) { 10764 - unsigned long res; 10765 - 10766 - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); 10767 - if (res != old_scan) 10768 - old_scan = res; 10769 - else 10770 - old_scan = next_scan; 10771 - } 10772 - if (time_before(now, old_scan)) 
10773 - return; 10774 - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) 10775 - return; 10776 - cidmask = mm_cidmask(mm); 10777 - /* Clear cids that were not recently used. */ 10778 - for_each_possible_cpu(cpu) 10779 - sched_mm_cid_remote_clear_old(mm, cpu); 10780 - weight = cpumask_weight(cidmask); 10781 - /* 10782 - * Clear cids that are greater or equal to the cidmask weight to 10783 - * recompact it. 10784 - */ 10785 - for_each_possible_cpu(cpu) 10786 - sched_mm_cid_remote_clear_weight(mm, cpu, weight); 10787 - } 10788 - 10789 - void init_sched_mm_cid(struct task_struct *t) 10790 - { 10791 - struct mm_struct *mm = t->mm; 10792 - int mm_users = 0; 10793 - 10794 - if (mm) { 10795 - mm_users = atomic_read(&mm->mm_users); 10796 - if (mm_users == 1) 10797 - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10798 - } 10799 - t->cid_work.next = &t->cid_work; /* Protect against double add */ 10800 - init_task_work(&t->cid_work, task_mm_cid_work); 10801 - } 10802 - 10803 - void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) 10804 - { 10805 - struct callback_head *work = &curr->cid_work; 10806 - unsigned long now = jiffies; 10807 - 10808 - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || 10809 - work->next != work) 10810 - return; 10811 - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) 10812 - return; 10813 - 10814 - /* No page allocation under rq lock */ 10815 - task_work_add(curr, work, TWA_RESUME); 10816 - } 10817 - 10818 10383 void sched_mm_cid_exit_signals(struct task_struct *t) 10819 10384 { 10820 10385 struct mm_struct *mm = t->mm; 10821 - struct rq *rq; 10822 10386 10823 - if (!mm) 10387 + if (!mm || !t->mm_cid_active) 10824 10388 return; 10825 10389 10826 - preempt_disable(); 10827 - rq = this_rq(); 10828 - guard(rq_lock_irqsave)(rq); 10829 - preempt_enable_no_resched(); /* holding spinlock */ 10830 - WRITE_ONCE(t->mm_cid_active, 0); 10831 - /* 10832 - * Store t->mm_cid_active before loading 
per-mm/cpu cid. 10833 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10834 - */ 10835 - smp_mb(); 10836 - mm_cid_put(mm); 10837 - t->last_mm_cid = t->mm_cid = -1; 10390 + guard(preempt)(); 10391 + t->mm_cid_active = 0; 10392 + if (t->mm_cid != MM_CID_UNSET) { 10393 + cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm)); 10394 + t->mm_cid = MM_CID_UNSET; 10395 + } 10838 10396 } 10839 10397 10398 + /* Deactivate MM CID allocation across execve() */ 10840 10399 void sched_mm_cid_before_execve(struct task_struct *t) 10841 10400 { 10842 - struct mm_struct *mm = t->mm; 10843 - struct rq *rq; 10844 - 10845 - if (!mm) 10846 - return; 10847 - 10848 - preempt_disable(); 10849 - rq = this_rq(); 10850 - guard(rq_lock_irqsave)(rq); 10851 - preempt_enable_no_resched(); /* holding spinlock */ 10852 - WRITE_ONCE(t->mm_cid_active, 0); 10853 - /* 10854 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10855 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10856 - */ 10857 - smp_mb(); 10858 - mm_cid_put(mm); 10859 - t->last_mm_cid = t->mm_cid = -1; 10401 + sched_mm_cid_exit_signals(t); 10860 10402 } 10861 10403 10404 + /* Reactivate MM CID after successful execve() */ 10862 10405 void sched_mm_cid_after_execve(struct task_struct *t) 10863 10406 { 10864 10407 struct mm_struct *mm = t->mm; 10865 - struct rq *rq; 10866 10408 10867 10409 if (!mm) 10868 10410 return; 10869 10411 10870 - preempt_disable(); 10871 - rq = this_rq(); 10872 - scoped_guard (rq_lock_irqsave, rq) { 10873 - preempt_enable_no_resched(); /* holding spinlock */ 10874 - WRITE_ONCE(t->mm_cid_active, 1); 10875 - /* 10876 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10877 - * Matches barrier in sched_mm_cid_remote_clear_old(). 
10878 - */ 10879 - smp_mb(); 10880 - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); 10881 - } 10412 + guard(preempt)(); 10413 + t->mm_cid_active = 1; 10414 + mm_cid_select(t); 10882 10415 } 10883 10416 10884 10417 void sched_mm_cid_fork(struct task_struct *t) 10885 10418 { 10886 - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); 10419 + WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET); 10887 10420 t->mm_cid_active = 1; 10888 10421 } 10889 10422 #endif /* CONFIG_SCHED_MM_CID */
+45 -248
kernel/sched/sched.h
··· 3540 3540 extern const char *preempt_modes[]; 3541 3541 3542 3542 #ifdef CONFIG_SCHED_MM_CID 3543 - 3544 - #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ 3545 - #define MM_CID_SCAN_DELAY 100 /* 100ms */ 3546 - 3547 - extern raw_spinlock_t cid_lock; 3548 - extern int use_cid_lock; 3549 - 3550 - extern void sched_mm_cid_migrate_from(struct task_struct *t); 3551 - extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); 3552 - extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); 3553 - extern void init_sched_mm_cid(struct task_struct *t); 3554 - 3555 - static inline void __mm_cid_put(struct mm_struct *mm, int cid) 3556 - { 3557 - if (cid < 0) 3558 - return; 3559 - cpumask_clear_cpu(cid, mm_cidmask(mm)); 3560 - } 3561 - 3562 - /* 3563 - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to 3564 - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to 3565 - * be held to transition to other states. 3566 - * 3567 - * State transitions synchronized with cmpxchg or try_cmpxchg need to be 3568 - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. 
3569 - */ 3570 - static inline void mm_cid_put_lazy(struct task_struct *t) 3543 + static inline void init_sched_mm_cid(struct task_struct *t) 3571 3544 { 3572 3545 struct mm_struct *mm = t->mm; 3573 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3574 - int cid; 3546 + unsigned int max_cid; 3575 3547 3576 - lockdep_assert_irqs_disabled(); 3577 - cid = __this_cpu_read(pcpu_cid->cid); 3578 - if (!mm_cid_is_lazy_put(cid) || 3579 - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3548 + if (!mm) 3580 3549 return; 3581 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3550 + 3551 + /* Preset last_mm_cid */ 3552 + max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); 3553 + t->last_mm_cid = max_cid - 1; 3582 3554 } 3583 3555 3584 - static inline int mm_cid_pcpu_unset(struct mm_struct *mm) 3556 + static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids) 3585 3557 { 3586 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3587 - int cid, res; 3558 + struct mm_struct *mm = t->mm; 3588 3559 3589 - lockdep_assert_irqs_disabled(); 3590 - cid = __this_cpu_read(pcpu_cid->cid); 3591 - for (;;) { 3592 - if (mm_cid_is_unset(cid)) 3593 - return MM_CID_UNSET; 3594 - /* 3595 - * Attempt transition from valid or lazy-put to unset. 
3596 - */ 3597 - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); 3598 - if (res == cid) 3599 - break; 3600 - cid = res; 3601 - } 3602 - return cid; 3560 + if (cid >= max_cids) 3561 + return false; 3562 + if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) 3563 + return false; 3564 + t->mm_cid = t->last_mm_cid = cid; 3565 + __this_cpu_write(mm->pcpu_cid->cid, cid); 3566 + return true; 3603 3567 } 3604 3568 3605 - static inline void mm_cid_put(struct mm_struct *mm) 3569 + static inline bool mm_cid_get(struct task_struct *t) 3606 3570 { 3607 - int cid; 3571 + struct mm_struct *mm = t->mm; 3572 + unsigned int max_cids; 3608 3573 3609 - lockdep_assert_irqs_disabled(); 3610 - cid = mm_cid_pcpu_unset(mm); 3611 - if (cid == MM_CID_UNSET) 3612 - return; 3613 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3574 + max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); 3575 + 3576 + /* Try to reuse the last CID of this task */ 3577 + if (__mm_cid_get(t, t->last_mm_cid, max_cids)) 3578 + return true; 3579 + 3580 + /* Try to reuse the last CID of this mm on this CPU */ 3581 + if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids)) 3582 + return true; 3583 + 3584 + /* Try the first zero bit in the cidmask. */ 3585 + return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids); 3614 3586 } 3615 3587 3616 - static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) 3588 + static inline void mm_cid_select(struct task_struct *t) 3617 3589 { 3618 - struct cpumask *cidmask = mm_cidmask(mm); 3619 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3620 - int cid, max_nr_cid, allowed_max_nr_cid; 3621 - 3622 3590 /* 3623 - * After shrinking the number of threads or reducing the number 3624 - * of allowed cpus, reduce the value of max_nr_cid so expansion 3625 - * of cid allocation will preserve cache locality if the number 3626 - * of threads or allowed cpus increase again. 
3627 - */ 3628 - max_nr_cid = atomic_read(&mm->max_nr_cid); 3629 - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), 3630 - atomic_read(&mm->mm_users))), 3631 - max_nr_cid > allowed_max_nr_cid) { 3632 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ 3633 - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { 3634 - max_nr_cid = allowed_max_nr_cid; 3635 - break; 3636 - } 3637 - } 3638 - /* Try to re-use recent cid. This improves cache locality. */ 3639 - cid = __this_cpu_read(pcpu_cid->recent_cid); 3640 - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && 3641 - !cpumask_test_and_set_cpu(cid, cidmask)) 3642 - return cid; 3643 - /* 3644 - * Expand cid allocation if the maximum number of concurrency 3645 - * IDs allocated (max_nr_cid) is below the number cpus allowed 3646 - * and number of threads. Expanding cid allocation as much as 3647 - * possible improves cache locality. 3648 - */ 3649 - cid = max_nr_cid; 3650 - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { 3651 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ 3652 - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) 3653 - continue; 3654 - if (!cpumask_test_and_set_cpu(cid, cidmask)) 3655 - return cid; 3656 - } 3657 - /* 3658 - * Find the first available concurrency id. 3659 - * Retry finding first zero bit if the mask is temporarily 3660 - * filled. This only happens during concurrent remote-clear 3661 - * which owns a cid without holding a rq lock. 3591 + * mm_cid_get() can fail when the maximum CID, which is determined 3592 + * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently. 3593 + * That's a transient failure as there cannot be more tasks 3594 + * concurrently on a CPU (or about to be scheduled in) than that. 
3662 3595 */ 3663 3596 for (;;) { 3664 - cid = cpumask_first_zero(cidmask); 3665 - if (cid < READ_ONCE(mm->nr_cpus_allowed)) 3597 + if (mm_cid_get(t)) 3666 3598 break; 3667 - cpu_relax(); 3668 3599 } 3669 - if (cpumask_test_and_set_cpu(cid, cidmask)) 3670 - return -1; 3671 - 3672 - return cid; 3673 3600 } 3674 3601 3675 - /* 3676 - * Save a snapshot of the current runqueue time of this cpu 3677 - * with the per-cpu cid value, allowing to estimate how recently it was used. 3678 - */ 3679 - static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) 3602 + static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) 3680 3603 { 3681 - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); 3682 - 3683 - lockdep_assert_rq_held(rq); 3684 - WRITE_ONCE(pcpu_cid->time, rq->clock); 3685 - } 3686 - 3687 - static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, 3688 - struct mm_struct *mm) 3689 - { 3690 - int cid; 3691 - 3692 - /* 3693 - * All allocations (even those using the cid_lock) are lock-free. If 3694 - * use_cid_lock is set, hold the cid_lock to perform cid allocation to 3695 - * guarantee forward progress. 3696 - */ 3697 - if (!READ_ONCE(use_cid_lock)) { 3698 - cid = __mm_cid_try_get(t, mm); 3699 - if (cid >= 0) 3700 - goto end; 3701 - raw_spin_lock(&cid_lock); 3702 - } else { 3703 - raw_spin_lock(&cid_lock); 3704 - cid = __mm_cid_try_get(t, mm); 3705 - if (cid >= 0) 3706 - goto unlock; 3707 - } 3708 - 3709 - /* 3710 - * cid concurrently allocated. Retry while forcing following 3711 - * allocations to use the cid_lock to ensure forward progress. 3712 - */ 3713 - WRITE_ONCE(use_cid_lock, 1); 3714 - /* 3715 - * Set use_cid_lock before allocation. Only care about program order 3716 - * because this is only required for forward progress. 3717 - */ 3718 - barrier(); 3719 - /* 3720 - * Retry until it succeeds. 
It is guaranteed to eventually succeed once 3721 - * all newcoming allocations observe the use_cid_lock flag set. 3722 - */ 3723 - do { 3724 - cid = __mm_cid_try_get(t, mm); 3725 - cpu_relax(); 3726 - } while (cid < 0); 3727 - /* 3728 - * Allocate before clearing use_cid_lock. Only care about 3729 - * program order because this is for forward progress. 3730 - */ 3731 - barrier(); 3732 - WRITE_ONCE(use_cid_lock, 0); 3733 - unlock: 3734 - raw_spin_unlock(&cid_lock); 3735 - end: 3736 - mm_cid_snapshot_time(rq, mm); 3737 - 3738 - return cid; 3739 - } 3740 - 3741 - static inline int mm_cid_get(struct rq *rq, struct task_struct *t, 3742 - struct mm_struct *mm) 3743 - { 3744 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3745 - int cid; 3746 - 3747 - lockdep_assert_rq_held(rq); 3748 - cid = __this_cpu_read(pcpu_cid->cid); 3749 - if (mm_cid_is_valid(cid)) { 3750 - mm_cid_snapshot_time(rq, mm); 3751 - return cid; 3752 - } 3753 - if (mm_cid_is_lazy_put(cid)) { 3754 - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3755 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3756 - } 3757 - cid = __mm_cid_get(rq, t, mm); 3758 - __this_cpu_write(pcpu_cid->cid, cid); 3759 - __this_cpu_write(pcpu_cid->recent_cid, cid); 3760 - 3761 - return cid; 3762 - } 3763 - 3764 - static inline void switch_mm_cid(struct rq *rq, 3765 - struct task_struct *prev, 3766 - struct task_struct *next) 3767 - { 3768 - /* 3769 - * Provide a memory barrier between rq->curr store and load of 3770 - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. 3771 - * 3772 - * Should be adapted if context_switch() is modified. 3773 - */ 3774 - if (!next->mm) { // to kernel 3775 - /* 3776 - * user -> kernel transition does not guarantee a barrier, but 3777 - * we can use the fact that it performs an atomic operation in 3778 - * mmgrab(). 
3779 - */ 3780 - if (prev->mm) // from user 3781 - smp_mb__after_mmgrab(); 3782 - /* 3783 - * kernel -> kernel transition does not change rq->curr->mm 3784 - * state. It stays NULL. 3785 - */ 3786 - } else { // to user 3787 - /* 3788 - * kernel -> user transition does not provide a barrier 3789 - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. 3790 - * Provide it here. 3791 - */ 3792 - if (!prev->mm) { // from kernel 3793 - smp_mb(); 3794 - } else { // from user 3795 - /* 3796 - * user->user transition relies on an implicit 3797 - * memory barrier in switch_mm() when 3798 - * current->mm changes. If the architecture 3799 - * switch_mm() does not have an implicit memory 3800 - * barrier, it is emitted here. If current->mm 3801 - * is unchanged, no barrier is needed. 3802 - */ 3803 - smp_mb__after_switch_mm(); 3804 - } 3805 - } 3806 3604 if (prev->mm_cid_active) { 3807 - mm_cid_snapshot_time(rq, prev->mm); 3808 - mm_cid_put_lazy(prev); 3809 - prev->mm_cid = -1; 3605 + if (prev->mm_cid != MM_CID_UNSET) 3606 + cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm)); 3607 + prev->mm_cid = MM_CID_UNSET; 3810 3608 } 3609 + 3811 3610 if (next->mm_cid_active) { 3812 - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); 3611 + mm_cid_select(next); 3813 3612 rseq_sched_set_task_mm_cid(next, next->mm_cid); 3814 3613 } 3815 3614 } 3816 3615 3817 3616 #else /* !CONFIG_SCHED_MM_CID: */ 3818 - static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } 3819 - static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } 3820 - static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } 3821 - static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } 3822 3617 static inline void init_sched_mm_cid(struct task_struct *t) { } 3618 + static inline void mm_cid_select(struct task_struct *t) { } 3619 + static inline void switch_mm_cid(struct 
task_struct *prev, struct task_struct *next) { } 3823 3620 #endif /* !CONFIG_SCHED_MM_CID */ 3824 3621 3825 3622 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);