sched/mmcid: Revert the complex CID management

+2 -51

include/linux/mm_types.h

··· 922 922 #define vma_policy(vma) NULL 923 923 #endif 924 924 925 - #ifdef CONFIG_SCHED_MM_CID 926 925 struct mm_cid { 927 - u64 time; 928 - int cid; 929 - int recent_cid; 926 + unsigned int cid; 930 927 }; 931 - #endif 932 928 933 929 /* 934 930 * Opaque type representing current mm_struct flag state. Must be accessed via ··· 996 1000 * runqueue locks. 997 1001 */ 998 1002 struct mm_cid __percpu *pcpu_cid; 999 - /* 1000 - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). 1001 - * 1002 - * When the next mm_cid scan is due (in jiffies). 1003 - */ 1004 - unsigned long mm_cid_next_scan; 1005 1003 /** 1006 1004 * @nr_cpus_allowed: Number of CPUs allowed for mm. 1007 1005 * ··· 1003 1013 * threads allowed CPUs. 1004 1014 */ 1005 1015 unsigned int nr_cpus_allowed; 1006 - /** 1007 - * @max_nr_cid: Maximum number of allowed concurrency 1008 - * IDs allocated. 1009 - * 1010 - * Track the highest number of allowed concurrency IDs 1011 - * allocated for the mm. 1012 - */ 1013 - atomic_t max_nr_cid; 1014 1016 /** 1015 1017 * @cpus_allowed_lock: Lock protecting mm cpus_allowed. 1016 1018 * ··· 1353 1371 1354 1372 #ifdef CONFIG_SCHED_MM_CID 1355 1373 1356 - enum mm_cid_state { 1357 - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. 
*/ 1358 - MM_CID_LAZY_PUT = (1U << 31), 1359 - }; 1360 - 1361 - static inline bool mm_cid_is_unset(int cid) 1362 - { 1363 - return cid == MM_CID_UNSET; 1364 - } 1365 - 1366 - static inline bool mm_cid_is_lazy_put(int cid) 1367 - { 1368 - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); 1369 - } 1370 - 1371 - static inline bool mm_cid_is_valid(int cid) 1372 - { 1373 - return !(cid & MM_CID_LAZY_PUT); 1374 - } 1375 - 1376 - static inline int mm_cid_set_lazy_put(int cid) 1377 - { 1378 - return cid | MM_CID_LAZY_PUT; 1379 - } 1380 - 1381 - static inline int mm_cid_clear_lazy_put(int cid) 1382 - { 1383 - return cid & ~MM_CID_LAZY_PUT; 1384 - } 1374 + #define MM_CID_UNSET (~0U) 1385 1375 1386 1376 /* 1387 1377 * mm_cpus_allowed: Union of all mm's threads allowed CPUs. ··· 1386 1432 struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); 1387 1433 1388 1434 pcpu_cid->cid = MM_CID_UNSET; 1389 - pcpu_cid->recent_cid = MM_CID_UNSET; 1390 - pcpu_cid->time = 0; 1391 1435 } 1392 1436 mm->nr_cpus_allowed = p->nr_cpus_allowed; 1393 - atomic_set(&mm->max_nr_cid, 0); 1394 1437 raw_spin_lock_init(&mm->cpus_allowed_lock); 1395 1438 cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 1396 1439 cpumask_clear(mm_cidmask(mm));

+2 -3

kernel/fork.c

··· 955 955 #endif 956 956 957 957 #ifdef CONFIG_SCHED_MM_CID 958 - tsk->mm_cid = -1; 959 - tsk->last_mm_cid = -1; 958 + tsk->mm_cid = MM_CID_UNSET; 959 + tsk->last_mm_cid = MM_CID_UNSET; 960 960 tsk->mm_cid_active = 0; 961 - tsk->migrate_from_cpu = -1; 962 961 #endif 963 962 return tsk; 964 963

+17 -500

kernel/sched/core.c

··· 2128 2128 { 2129 2129 if (task_on_rq_migrating(p)) 2130 2130 flags |= ENQUEUE_MIGRATED; 2131 - if (flags & ENQUEUE_MIGRATED) 2132 - sched_mm_cid_migrate_to(rq, p); 2133 2131 2134 2132 enqueue_task(rq, p, flags); 2135 2133 ··· 3327 3329 if (p->sched_class->migrate_task_rq) 3328 3330 p->sched_class->migrate_task_rq(p, new_cpu); 3329 3331 p->se.nr_migrations++; 3330 - sched_mm_cid_migrate_from(p); 3331 3332 perf_event_task_migrate(p); 3332 3333 } 3333 3334 ··· 5277 5280 * 5278 5281 * kernel -> user switch + mmdrop_lazy_tlb() active 5279 5282 * user -> user switch 5280 - * 5281 - * switch_mm_cid() needs to be updated if the barriers provided 5282 - * by context_switch() are modified. 5283 5283 */ 5284 5284 if (!next->mm) { // to kernel 5285 5285 enter_lazy_tlb(prev->active_mm, next); ··· 5306 5312 } 5307 5313 } 5308 5314 5309 - /* switch_mm_cid() requires the memory barriers above. */ 5310 - switch_mm_cid(rq, prev, next); 5315 + switch_mm_cid(prev, next); 5311 5316 5312 5317 /* 5313 5318 * Tell rseq that the task was scheduled in. Must be after ··· 5597 5604 resched_latency = cpu_resched_latency(rq); 5598 5605 calc_global_load_tick(rq); 5599 5606 sched_core_tick(rq); 5600 - task_tick_mm_cid(rq, donor); 5601 5607 scx_tick(rq); 5602 5608 5603 5609 rq_unlock(rq, &rf); ··· 10368 10376 } 10369 10377 10370 10378 #ifdef CONFIG_SCHED_MM_CID 10371 - 10372 10379 /* 10373 - * @cid_lock: Guarantee forward-progress of cid allocation. 10374 - * 10375 - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock 10376 - * is only used when contention is detected by the lock-free allocation so 10377 - * forward progress can be guaranteed. 10380 + * When a task exits, the MM CID held by the task is no longer required as 10381 + * the task cannot return to user space. 10378 10382 */ 10379 - DEFINE_RAW_SPINLOCK(cid_lock); 10380 - 10381 - /* 10382 - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. 
10383 - * 10384 - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is 10385 - * detected, it is set to 1 to ensure that all newly coming allocations are 10386 - * serialized by @cid_lock until the allocation which detected contention 10387 - * completes and sets @use_cid_lock back to 0. This guarantees forward progress 10388 - * of a cid allocation. 10389 - */ 10390 - int use_cid_lock; 10391 - 10392 - /* 10393 - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid 10394 - * concurrently with respect to the execution of the source runqueue context 10395 - * switch. 10396 - * 10397 - * There is one basic properties we want to guarantee here: 10398 - * 10399 - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively 10400 - * used by a task. That would lead to concurrent allocation of the cid and 10401 - * userspace corruption. 10402 - * 10403 - * Provide this guarantee by introducing a Dekker memory ordering to guarantee 10404 - * that a pair of loads observe at least one of a pair of stores, which can be 10405 - * shown as: 10406 - * 10407 - * X = Y = 0 10408 - * 10409 - * w[X]=1 w[Y]=1 10410 - * MB MB 10411 - * r[Y]=y r[X]=x 10412 - * 10413 - * Which guarantees that x==0 && y==0 is impossible. But rather than using 10414 - * values 0 and 1, this algorithm cares about specific state transitions of the 10415 - * runqueue current task (as updated by the scheduler context switch), and the 10416 - * per-mm/cpu cid value. 10417 - * 10418 - * Let's introduce task (Y) which has task->mm == mm and task (N) which has 10419 - * task->mm != mm for the rest of the discussion. 
There are two scheduler state 10420 - * transitions on context switch we care about: 10421 - * 10422 - * (TSA) Store to rq->curr with transition from (N) to (Y) 10423 - * 10424 - * (TSB) Store to rq->curr with transition from (Y) to (N) 10425 - * 10426 - * On the remote-clear side, there is one transition we care about: 10427 - * 10428 - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag 10429 - * 10430 - * There is also a transition to UNSET state which can be performed from all 10431 - * sides (scheduler, remote-clear). It is always performed with a cmpxchg which 10432 - * guarantees that only a single thread will succeed: 10433 - * 10434 - * (TMB) cmpxchg to *pcpu_cid to mark UNSET 10435 - * 10436 - * Just to be clear, what we do _not_ want to happen is a transition to UNSET 10437 - * when a thread is actively using the cid (property (1)). 10438 - * 10439 - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. 10440 - * 10441 - * Scenario A) (TSA)+(TMA) (from next task perspective) 10442 - * 10443 - * CPU0 CPU1 10444 - * 10445 - * Context switch CS-1 Remote-clear 10446 - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) 10447 - * (implied barrier after cmpxchg) 10448 - * - switch_mm_cid() 10449 - * - memory barrier (see switch_mm_cid() 10450 - * comment explaining how this barrier 10451 - * is combined with other scheduler 10452 - * barriers) 10453 - * - mm_cid_get (next) 10454 - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) 10455 - * 10456 - * This Dekker ensures that either task (Y) is observed by the 10457 - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are 10458 - * observed. 10459 - * 10460 - * If task (Y) store is observed by rcu_dereference(), it means that there is 10461 - * still an active task on the cpu. Remote-clear will therefore not transition 10462 - * to UNSET, which fulfills property (1). 
10463 - * 10464 - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), 10465 - * it will move its state to UNSET, which clears the percpu cid perhaps 10466 - * uselessly (which is not an issue for correctness). Because task (Y) is not 10467 - * observed, CPU1 can move ahead to set the state to UNSET. Because moving 10468 - * state to UNSET is done with a cmpxchg expecting that the old state has the 10469 - * LAZY flag set, only one thread will successfully UNSET. 10470 - * 10471 - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 10472 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and 10473 - * CPU1 will observe task (Y) and do nothing more, which is fine. 10474 - * 10475 - * What we are effectively preventing with this Dekker is a scenario where 10476 - * neither LAZY flag nor store (Y) are observed, which would fail property (1) 10477 - * because this would UNSET a cid which is actively used. 10478 - */ 10479 - 10480 - void sched_mm_cid_migrate_from(struct task_struct *t) 10481 - { 10482 - t->migrate_from_cpu = task_cpu(t); 10483 - } 10484 - 10485 - static 10486 - int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, 10487 - struct task_struct *t, 10488 - struct mm_cid *src_pcpu_cid) 10489 - { 10490 - struct mm_struct *mm = t->mm; 10491 - struct task_struct *src_task; 10492 - int src_cid, last_mm_cid; 10493 - 10494 - if (!mm) 10495 - return -1; 10496 - 10497 - last_mm_cid = t->last_mm_cid; 10498 - /* 10499 - * If the migrated task has no last cid, or if the current 10500 - * task on src rq uses the cid, it means the source cid does not need 10501 - * to be moved to the destination cpu. 
10502 - */ 10503 - if (last_mm_cid == -1) 10504 - return -1; 10505 - src_cid = READ_ONCE(src_pcpu_cid->cid); 10506 - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) 10507 - return -1; 10508 - 10509 - /* 10510 - * If we observe an active task using the mm on this rq, it means we 10511 - * are not the last task to be migrated from this cpu for this mm, so 10512 - * there is no need to move src_cid to the destination cpu. 10513 - */ 10514 - guard(rcu)(); 10515 - src_task = rcu_dereference(src_rq->curr); 10516 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10517 - t->last_mm_cid = -1; 10518 - return -1; 10519 - } 10520 - 10521 - return src_cid; 10522 - } 10523 - 10524 - static 10525 - int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, 10526 - struct task_struct *t, 10527 - struct mm_cid *src_pcpu_cid, 10528 - int src_cid) 10529 - { 10530 - struct task_struct *src_task; 10531 - struct mm_struct *mm = t->mm; 10532 - int lazy_cid; 10533 - 10534 - if (src_cid == -1) 10535 - return -1; 10536 - 10537 - /* 10538 - * Attempt to clear the source cpu cid to move it to the destination 10539 - * cpu. 10540 - */ 10541 - lazy_cid = mm_cid_set_lazy_put(src_cid); 10542 - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) 10543 - return -1; 10544 - 10545 - /* 10546 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10547 - * rq->curr->mm matches the scheduler barrier in context_switch() 10548 - * between store to rq->curr and load of prev and next task's 10549 - * per-mm/cpu cid. 10550 - * 10551 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10552 - * rq->curr->mm_cid_active matches the barrier in 10553 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10554 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10555 - * load of per-mm/cpu cid. 
10556 - */ 10557 - 10558 - /* 10559 - * If we observe an active task using the mm on this rq after setting 10560 - * the lazy-put flag, this task will be responsible for transitioning 10561 - * from lazy-put flag set to MM_CID_UNSET. 10562 - */ 10563 - scoped_guard (rcu) { 10564 - src_task = rcu_dereference(src_rq->curr); 10565 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10566 - /* 10567 - * We observed an active task for this mm, there is therefore 10568 - * no point in moving this cid to the destination cpu. 10569 - */ 10570 - t->last_mm_cid = -1; 10571 - return -1; 10572 - } 10573 - } 10574 - 10575 - /* 10576 - * The src_cid is unused, so it can be unset. 10577 - */ 10578 - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10579 - return -1; 10580 - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); 10581 - return src_cid; 10582 - } 10583 - 10584 - /* 10585 - * Migration to dst cpu. Called with dst_rq lock held. 10586 - * Interrupts are disabled, which keeps the window of cid ownership without the 10587 - * source rq lock held small. 10588 - */ 10589 - void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) 10590 - { 10591 - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; 10592 - struct mm_struct *mm = t->mm; 10593 - int src_cid, src_cpu; 10594 - bool dst_cid_is_set; 10595 - struct rq *src_rq; 10596 - 10597 - lockdep_assert_rq_held(dst_rq); 10598 - 10599 - if (!mm) 10600 - return; 10601 - src_cpu = t->migrate_from_cpu; 10602 - if (src_cpu == -1) { 10603 - t->last_mm_cid = -1; 10604 - return; 10605 - } 10606 - /* 10607 - * Move the src cid if the dst cid is unset. This keeps id 10608 - * allocation closest to 0 in cases where few threads migrate around 10609 - * many CPUs. 10610 - * 10611 - * If destination cid or recent cid is already set, we may have 10612 - * to just clear the src cid to ensure compactness in frequent 10613 - * migrations scenarios. 
10614 - * 10615 - * It is not useful to clear the src cid when the number of threads is 10616 - * greater or equal to the number of allowed CPUs, because user-space 10617 - * can expect that the number of allowed cids can reach the number of 10618 - * allowed CPUs. 10619 - */ 10620 - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); 10621 - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || 10622 - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); 10623 - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) 10624 - return; 10625 - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); 10626 - src_rq = cpu_rq(src_cpu); 10627 - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); 10628 - if (src_cid == -1) 10629 - return; 10630 - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, 10631 - src_cid); 10632 - if (src_cid == -1) 10633 - return; 10634 - if (dst_cid_is_set) { 10635 - __mm_cid_put(mm, src_cid); 10636 - return; 10637 - } 10638 - /* Move src_cid to dst cpu. */ 10639 - mm_cid_snapshot_time(dst_rq, mm); 10640 - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); 10641 - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); 10642 - } 10643 - 10644 - static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, 10645 - int cpu) 10646 - { 10647 - struct rq *rq = cpu_rq(cpu); 10648 - struct task_struct *t; 10649 - int cid, lazy_cid; 10650 - 10651 - cid = READ_ONCE(pcpu_cid->cid); 10652 - if (!mm_cid_is_valid(cid)) 10653 - return; 10654 - 10655 - /* 10656 - * Clear the cpu cid if it is set to keep cid allocation compact. If 10657 - * there happens to be other tasks left on the source cpu using this 10658 - * mm, the next task using this mm will reallocate its cid on context 10659 - * switch. 
10660 - */ 10661 - lazy_cid = mm_cid_set_lazy_put(cid); 10662 - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) 10663 - return; 10664 - 10665 - /* 10666 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10667 - * rq->curr->mm matches the scheduler barrier in context_switch() 10668 - * between store to rq->curr and load of prev and next task's 10669 - * per-mm/cpu cid. 10670 - * 10671 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10672 - * rq->curr->mm_cid_active matches the barrier in 10673 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10674 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10675 - * load of per-mm/cpu cid. 10676 - */ 10677 - 10678 - /* 10679 - * If we observe an active task using the mm on this rq after setting 10680 - * the lazy-put flag, that task will be responsible for transitioning 10681 - * from lazy-put flag set to MM_CID_UNSET. 10682 - */ 10683 - scoped_guard (rcu) { 10684 - t = rcu_dereference(rq->curr); 10685 - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) 10686 - return; 10687 - } 10688 - 10689 - /* 10690 - * The cid is unused, so it can be unset. 10691 - * Disable interrupts to keep the window of cid ownership without rq 10692 - * lock small. 10693 - */ 10694 - scoped_guard (irqsave) { 10695 - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10696 - __mm_cid_put(mm, cid); 10697 - } 10698 - } 10699 - 10700 - static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) 10701 - { 10702 - struct rq *rq = cpu_rq(cpu); 10703 - struct mm_cid *pcpu_cid; 10704 - struct task_struct *curr; 10705 - u64 rq_clock; 10706 - 10707 - /* 10708 - * rq->clock load is racy on 32-bit but one spurious clear once in a 10709 - * while is irrelevant. 
10710 - */ 10711 - rq_clock = READ_ONCE(rq->clock); 10712 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10713 - 10714 - /* 10715 - * In order to take care of infrequently scheduled tasks, bump the time 10716 - * snapshot associated with this cid if an active task using the mm is 10717 - * observed on this rq. 10718 - */ 10719 - scoped_guard (rcu) { 10720 - curr = rcu_dereference(rq->curr); 10721 - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { 10722 - WRITE_ONCE(pcpu_cid->time, rq_clock); 10723 - return; 10724 - } 10725 - } 10726 - 10727 - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) 10728 - return; 10729 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10730 - } 10731 - 10732 - static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, 10733 - int weight) 10734 - { 10735 - struct mm_cid *pcpu_cid; 10736 - int cid; 10737 - 10738 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10739 - cid = READ_ONCE(pcpu_cid->cid); 10740 - if (!mm_cid_is_valid(cid) || cid < weight) 10741 - return; 10742 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10743 - } 10744 - 10745 - static void task_mm_cid_work(struct callback_head *work) 10746 - { 10747 - unsigned long now = jiffies, old_scan, next_scan; 10748 - struct task_struct *t = current; 10749 - struct cpumask *cidmask; 10750 - struct mm_struct *mm; 10751 - int weight, cpu; 10752 - 10753 - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); 10754 - 10755 - work->next = work; /* Prevent double-add */ 10756 - if (t->flags & PF_EXITING) 10757 - return; 10758 - mm = t->mm; 10759 - if (!mm) 10760 - return; 10761 - old_scan = READ_ONCE(mm->mm_cid_next_scan); 10762 - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10763 - if (!old_scan) { 10764 - unsigned long res; 10765 - 10766 - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); 10767 - if (res != old_scan) 10768 - old_scan = res; 10769 - else 10770 - old_scan = next_scan; 10771 - } 10772 - if (time_before(now, old_scan)) 
10773 - return; 10774 - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) 10775 - return; 10776 - cidmask = mm_cidmask(mm); 10777 - /* Clear cids that were not recently used. */ 10778 - for_each_possible_cpu(cpu) 10779 - sched_mm_cid_remote_clear_old(mm, cpu); 10780 - weight = cpumask_weight(cidmask); 10781 - /* 10782 - * Clear cids that are greater or equal to the cidmask weight to 10783 - * recompact it. 10784 - */ 10785 - for_each_possible_cpu(cpu) 10786 - sched_mm_cid_remote_clear_weight(mm, cpu, weight); 10787 - } 10788 - 10789 - void init_sched_mm_cid(struct task_struct *t) 10790 - { 10791 - struct mm_struct *mm = t->mm; 10792 - int mm_users = 0; 10793 - 10794 - if (mm) { 10795 - mm_users = atomic_read(&mm->mm_users); 10796 - if (mm_users == 1) 10797 - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10798 - } 10799 - t->cid_work.next = &t->cid_work; /* Protect against double add */ 10800 - init_task_work(&t->cid_work, task_mm_cid_work); 10801 - } 10802 - 10803 - void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) 10804 - { 10805 - struct callback_head *work = &curr->cid_work; 10806 - unsigned long now = jiffies; 10807 - 10808 - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || 10809 - work->next != work) 10810 - return; 10811 - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) 10812 - return; 10813 - 10814 - /* No page allocation under rq lock */ 10815 - task_work_add(curr, work, TWA_RESUME); 10816 - } 10817 - 10818 10383 void sched_mm_cid_exit_signals(struct task_struct *t) 10819 10384 { 10820 10385 struct mm_struct *mm = t->mm; 10821 - struct rq *rq; 10822 10386 10823 - if (!mm) 10387 + if (!mm || !t->mm_cid_active) 10824 10388 return; 10825 10389 10826 - preempt_disable(); 10827 - rq = this_rq(); 10828 - guard(rq_lock_irqsave)(rq); 10829 - preempt_enable_no_resched(); /* holding spinlock */ 10830 - WRITE_ONCE(t->mm_cid_active, 0); 10831 - /* 10832 - * Store t->mm_cid_active before loading 
per-mm/cpu cid. 10833 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10834 - */ 10835 - smp_mb(); 10836 - mm_cid_put(mm); 10837 - t->last_mm_cid = t->mm_cid = -1; 10390 + guard(preempt)(); 10391 + t->mm_cid_active = 0; 10392 + if (t->mm_cid != MM_CID_UNSET) { 10393 + cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm)); 10394 + t->mm_cid = MM_CID_UNSET; 10395 + } 10838 10396 } 10839 10397 10398 + /* Deactivate MM CID allocation across execve() */ 10840 10399 void sched_mm_cid_before_execve(struct task_struct *t) 10841 10400 { 10842 - struct mm_struct *mm = t->mm; 10843 - struct rq *rq; 10844 - 10845 - if (!mm) 10846 - return; 10847 - 10848 - preempt_disable(); 10849 - rq = this_rq(); 10850 - guard(rq_lock_irqsave)(rq); 10851 - preempt_enable_no_resched(); /* holding spinlock */ 10852 - WRITE_ONCE(t->mm_cid_active, 0); 10853 - /* 10854 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10855 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10856 - */ 10857 - smp_mb(); 10858 - mm_cid_put(mm); 10859 - t->last_mm_cid = t->mm_cid = -1; 10401 + sched_mm_cid_exit_signals(t); 10860 10402 } 10861 10403 10404 + /* Reactivate MM CID after successful execve() */ 10862 10405 void sched_mm_cid_after_execve(struct task_struct *t) 10863 10406 { 10864 10407 struct mm_struct *mm = t->mm; 10865 - struct rq *rq; 10866 10408 10867 10409 if (!mm) 10868 10410 return; 10869 10411 10870 - preempt_disable(); 10871 - rq = this_rq(); 10872 - scoped_guard (rq_lock_irqsave, rq) { 10873 - preempt_enable_no_resched(); /* holding spinlock */ 10874 - WRITE_ONCE(t->mm_cid_active, 1); 10875 - /* 10876 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10877 - * Matches barrier in sched_mm_cid_remote_clear_old(). 
10878 - */ 10879 - smp_mb(); 10880 - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); 10881 - } 10412 + guard(preempt)(); 10413 + t->mm_cid_active = 1; 10414 + mm_cid_select(t); 10882 10415 } 10883 10416 10884 10417 void sched_mm_cid_fork(struct task_struct *t) 10885 10418 { 10886 - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); 10419 + WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET); 10887 10420 t->mm_cid_active = 1; 10888 10421 } 10889 10422 #endif /* CONFIG_SCHED_MM_CID */

+45 -248

kernel/sched/sched.h

··· 3540 3540 extern const char *preempt_modes[]; 3541 3541 3542 3542 #ifdef CONFIG_SCHED_MM_CID 3543 - 3544 - #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ 3545 - #define MM_CID_SCAN_DELAY 100 /* 100ms */ 3546 - 3547 - extern raw_spinlock_t cid_lock; 3548 - extern int use_cid_lock; 3549 - 3550 - extern void sched_mm_cid_migrate_from(struct task_struct *t); 3551 - extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); 3552 - extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); 3553 - extern void init_sched_mm_cid(struct task_struct *t); 3554 - 3555 - static inline void __mm_cid_put(struct mm_struct *mm, int cid) 3556 - { 3557 - if (cid < 0) 3558 - return; 3559 - cpumask_clear_cpu(cid, mm_cidmask(mm)); 3560 - } 3561 - 3562 - /* 3563 - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to 3564 - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to 3565 - * be held to transition to other states. 3566 - * 3567 - * State transitions synchronized with cmpxchg or try_cmpxchg need to be 3568 - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. 
3569 - */ 3570 - static inline void mm_cid_put_lazy(struct task_struct *t) 3543 + static inline void init_sched_mm_cid(struct task_struct *t) 3571 3544 { 3572 3545 struct mm_struct *mm = t->mm; 3573 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3574 - int cid; 3546 + unsigned int max_cid; 3575 3547 3576 - lockdep_assert_irqs_disabled(); 3577 - cid = __this_cpu_read(pcpu_cid->cid); 3578 - if (!mm_cid_is_lazy_put(cid) || 3579 - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3548 + if (!mm) 3580 3549 return; 3581 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3550 + 3551 + /* Preset last_mm_cid */ 3552 + max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); 3553 + t->last_mm_cid = max_cid - 1; 3582 3554 } 3583 3555 3584 - static inline int mm_cid_pcpu_unset(struct mm_struct *mm) 3556 + static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids) 3585 3557 { 3586 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3587 - int cid, res; 3558 + struct mm_struct *mm = t->mm; 3588 3559 3589 - lockdep_assert_irqs_disabled(); 3590 - cid = __this_cpu_read(pcpu_cid->cid); 3591 - for (;;) { 3592 - if (mm_cid_is_unset(cid)) 3593 - return MM_CID_UNSET; 3594 - /* 3595 - * Attempt transition from valid or lazy-put to unset. 
3596 - */ 3597 - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); 3598 - if (res == cid) 3599 - break; 3600 - cid = res; 3601 - } 3602 - return cid; 3560 + if (cid >= max_cids) 3561 + return false; 3562 + if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm))) 3563 + return false; 3564 + t->mm_cid = t->last_mm_cid = cid; 3565 + __this_cpu_write(mm->pcpu_cid->cid, cid); 3566 + return true; 3603 3567 } 3604 3568 3605 - static inline void mm_cid_put(struct mm_struct *mm) 3569 + static inline bool mm_cid_get(struct task_struct *t) 3606 3570 { 3607 - int cid; 3571 + struct mm_struct *mm = t->mm; 3572 + unsigned int max_cids; 3608 3573 3609 - lockdep_assert_irqs_disabled(); 3610 - cid = mm_cid_pcpu_unset(mm); 3611 - if (cid == MM_CID_UNSET) 3612 - return; 3613 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3574 + max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users)); 3575 + 3576 + /* Try to reuse the last CID of this task */ 3577 + if (__mm_cid_get(t, t->last_mm_cid, max_cids)) 3578 + return true; 3579 + 3580 + /* Try to reuse the last CID of this mm on this CPU */ 3581 + if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids)) 3582 + return true; 3583 + 3584 + /* Try the first zero bit in the cidmask. */ 3585 + return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids); 3614 3586 } 3615 3587 3616 - static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) 3588 + static inline void mm_cid_select(struct task_struct *t) 3617 3589 { 3618 - struct cpumask *cidmask = mm_cidmask(mm); 3619 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3620 - int cid, max_nr_cid, allowed_max_nr_cid; 3621 - 3622 3590 /* 3623 - * After shrinking the number of threads or reducing the number 3624 - * of allowed cpus, reduce the value of max_nr_cid so expansion 3625 - * of cid allocation will preserve cache locality if the number 3626 - * of threads or allowed cpus increase again. 
3627 - */ 3628 - max_nr_cid = atomic_read(&mm->max_nr_cid); 3629 - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), 3630 - atomic_read(&mm->mm_users))), 3631 - max_nr_cid > allowed_max_nr_cid) { 3632 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ 3633 - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { 3634 - max_nr_cid = allowed_max_nr_cid; 3635 - break; 3636 - } 3637 - } 3638 - /* Try to re-use recent cid. This improves cache locality. */ 3639 - cid = __this_cpu_read(pcpu_cid->recent_cid); 3640 - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && 3641 - !cpumask_test_and_set_cpu(cid, cidmask)) 3642 - return cid; 3643 - /* 3644 - * Expand cid allocation if the maximum number of concurrency 3645 - * IDs allocated (max_nr_cid) is below the number cpus allowed 3646 - * and number of threads. Expanding cid allocation as much as 3647 - * possible improves cache locality. 3648 - */ 3649 - cid = max_nr_cid; 3650 - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { 3651 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ 3652 - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) 3653 - continue; 3654 - if (!cpumask_test_and_set_cpu(cid, cidmask)) 3655 - return cid; 3656 - } 3657 - /* 3658 - * Find the first available concurrency id. 3659 - * Retry finding first zero bit if the mask is temporarily 3660 - * filled. This only happens during concurrent remote-clear 3661 - * which owns a cid without holding a rq lock. 3591 + * mm_cid_get() can fail when the maximum CID, which is determined 3592 + * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently. 3593 + * That's a transient failure as there cannot be more tasks 3594 + * concurrently on a CPU (or about to be scheduled in) than that. 
3662 3595 */ 3663 3596 for (;;) { 3664 - cid = cpumask_first_zero(cidmask); 3665 - if (cid < READ_ONCE(mm->nr_cpus_allowed)) 3597 + if (mm_cid_get(t)) 3666 3598 break; 3667 - cpu_relax(); 3668 3599 } 3669 - if (cpumask_test_and_set_cpu(cid, cidmask)) 3670 - return -1; 3671 - 3672 - return cid; 3673 3600 } 3674 3601 3675 - /* 3676 - * Save a snapshot of the current runqueue time of this cpu 3677 - * with the per-cpu cid value, allowing to estimate how recently it was used. 3678 - */ 3679 - static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) 3602 + static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) 3680 3603 { 3681 - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); 3682 - 3683 - lockdep_assert_rq_held(rq); 3684 - WRITE_ONCE(pcpu_cid->time, rq->clock); 3685 - } 3686 - 3687 - static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, 3688 - struct mm_struct *mm) 3689 - { 3690 - int cid; 3691 - 3692 - /* 3693 - * All allocations (even those using the cid_lock) are lock-free. If 3694 - * use_cid_lock is set, hold the cid_lock to perform cid allocation to 3695 - * guarantee forward progress. 3696 - */ 3697 - if (!READ_ONCE(use_cid_lock)) { 3698 - cid = __mm_cid_try_get(t, mm); 3699 - if (cid >= 0) 3700 - goto end; 3701 - raw_spin_lock(&cid_lock); 3702 - } else { 3703 - raw_spin_lock(&cid_lock); 3704 - cid = __mm_cid_try_get(t, mm); 3705 - if (cid >= 0) 3706 - goto unlock; 3707 - } 3708 - 3709 - /* 3710 - * cid concurrently allocated. Retry while forcing following 3711 - * allocations to use the cid_lock to ensure forward progress. 3712 - */ 3713 - WRITE_ONCE(use_cid_lock, 1); 3714 - /* 3715 - * Set use_cid_lock before allocation. Only care about program order 3716 - * because this is only required for forward progress. 3717 - */ 3718 - barrier(); 3719 - /* 3720 - * Retry until it succeeds. 
It is guaranteed to eventually succeed once 3721 - * all newcoming allocations observe the use_cid_lock flag set. 3722 - */ 3723 - do { 3724 - cid = __mm_cid_try_get(t, mm); 3725 - cpu_relax(); 3726 - } while (cid < 0); 3727 - /* 3728 - * Allocate before clearing use_cid_lock. Only care about 3729 - * program order because this is for forward progress. 3730 - */ 3731 - barrier(); 3732 - WRITE_ONCE(use_cid_lock, 0); 3733 - unlock: 3734 - raw_spin_unlock(&cid_lock); 3735 - end: 3736 - mm_cid_snapshot_time(rq, mm); 3737 - 3738 - return cid; 3739 - } 3740 - 3741 - static inline int mm_cid_get(struct rq *rq, struct task_struct *t, 3742 - struct mm_struct *mm) 3743 - { 3744 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3745 - int cid; 3746 - 3747 - lockdep_assert_rq_held(rq); 3748 - cid = __this_cpu_read(pcpu_cid->cid); 3749 - if (mm_cid_is_valid(cid)) { 3750 - mm_cid_snapshot_time(rq, mm); 3751 - return cid; 3752 - } 3753 - if (mm_cid_is_lazy_put(cid)) { 3754 - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3755 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3756 - } 3757 - cid = __mm_cid_get(rq, t, mm); 3758 - __this_cpu_write(pcpu_cid->cid, cid); 3759 - __this_cpu_write(pcpu_cid->recent_cid, cid); 3760 - 3761 - return cid; 3762 - } 3763 - 3764 - static inline void switch_mm_cid(struct rq *rq, 3765 - struct task_struct *prev, 3766 - struct task_struct *next) 3767 - { 3768 - /* 3769 - * Provide a memory barrier between rq->curr store and load of 3770 - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. 3771 - * 3772 - * Should be adapted if context_switch() is modified. 3773 - */ 3774 - if (!next->mm) { // to kernel 3775 - /* 3776 - * user -> kernel transition does not guarantee a barrier, but 3777 - * we can use the fact that it performs an atomic operation in 3778 - * mmgrab(). 
3779 - */ 3780 - if (prev->mm) // from user 3781 - smp_mb__after_mmgrab(); 3782 - /* 3783 - * kernel -> kernel transition does not change rq->curr->mm 3784 - * state. It stays NULL. 3785 - */ 3786 - } else { // to user 3787 - /* 3788 - * kernel -> user transition does not provide a barrier 3789 - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. 3790 - * Provide it here. 3791 - */ 3792 - if (!prev->mm) { // from kernel 3793 - smp_mb(); 3794 - } else { // from user 3795 - /* 3796 - * user->user transition relies on an implicit 3797 - * memory barrier in switch_mm() when 3798 - * current->mm changes. If the architecture 3799 - * switch_mm() does not have an implicit memory 3800 - * barrier, it is emitted here. If current->mm 3801 - * is unchanged, no barrier is needed. 3802 - */ 3803 - smp_mb__after_switch_mm(); 3804 - } 3805 - } 3806 3604 if (prev->mm_cid_active) { 3807 - mm_cid_snapshot_time(rq, prev->mm); 3808 - mm_cid_put_lazy(prev); 3809 - prev->mm_cid = -1; 3605 + if (prev->mm_cid != MM_CID_UNSET) 3606 + cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm)); 3607 + prev->mm_cid = MM_CID_UNSET; 3810 3608 } 3609 + 3811 3610 if (next->mm_cid_active) { 3812 - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); 3611 + mm_cid_select(next); 3813 3612 rseq_sched_set_task_mm_cid(next, next->mm_cid); 3814 3613 } 3815 3614 } 3816 3615 3817 3616 #else /* !CONFIG_SCHED_MM_CID: */ 3818 - static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } 3819 - static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } 3820 - static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } 3821 - static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } 3822 3617 static inline void init_sched_mm_cid(struct task_struct *t) { } 3618 + static inline void mm_cid_select(struct task_struct *t) { } 3619 + static inline void switch_mm_cid(struct 
task_struct *prev, struct task_struct *next) { } 3823 3620 #endif /* !CONFIG_SCHED_MM_CID */ 3824 3621 3825 3622 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);

Configure Feed

Configure Feed