Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
"Misc fixes all over the place:

- Fix NUMA over-balancing between lightly loaded nodes. This is
fallout of the big load-balancer rewrite.

- Fix the NOHZ remote loadavg update logic, which fixes anomalies
such as a reported loadavg of 150 on mostly idle CPUs.

- Fix an XFS performance/scalability regression

- Fix unbounded task execution in throttled groups

- Fix PSI procfs boundary condition

- Fix the cpu.uclamp.{min,max} cgroup configuration write checks

- Fix DocBook annotations

- Fix RCU annotations

- Fix the overly CPU-intensive housekeeping-CPU search loop on systems
with large CPU counts"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Fix kernel-doc warning in attach_entity_load_avg()
sched/core: Annotate curr pointer in rq with __rcu
sched/psi: Fix OOB write when writing 0 bytes to PSI files
sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression
sched/fair: Prevent unlimited runtime on throttled group
sched/nohz: Optimize get_nohz_timer_target()
sched/uclamp: Reject negative values in cpu_uclamp_write()
sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains
timers/nohz: Update NOHZ load in remote tick
sched/core: Don't skip remote tick for idle CPUs

+119 -53
+2
include/linux/sched/nohz.h
··· 15 15 16 16 #ifdef CONFIG_NO_HZ_COMMON 17 17 void calc_load_nohz_start(void); 18 + void calc_load_nohz_remote(struct rq *rq); 18 19 void calc_load_nohz_stop(void); 19 20 #else 20 21 static inline void calc_load_nohz_start(void) { } 22 + static inline void calc_load_nohz_remote(struct rq *rq) { } 21 23 static inline void calc_load_nohz_stop(void) { } 22 24 #endif /* CONFIG_NO_HZ_COMMON */ 23 25
+34 -29
kernel/sched/core.c
··· 552 552 */ 553 553 int get_nohz_timer_target(void) 554 554 { 555 - int i, cpu = smp_processor_id(); 555 + int i, cpu = smp_processor_id(), default_cpu = -1; 556 556 struct sched_domain *sd; 557 557 558 - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) 559 - return cpu; 558 + if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { 559 + if (!idle_cpu(cpu)) 560 + return cpu; 561 + default_cpu = cpu; 562 + } 560 563 561 564 rcu_read_lock(); 562 565 for_each_domain(cpu, sd) { 563 - for_each_cpu(i, sched_domain_span(sd)) { 566 + for_each_cpu_and(i, sched_domain_span(sd), 567 + housekeeping_cpumask(HK_FLAG_TIMER)) { 564 568 if (cpu == i) 565 569 continue; 566 570 567 - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { 571 + if (!idle_cpu(i)) { 568 572 cpu = i; 569 573 goto unlock; 570 574 } 571 575 } 572 576 } 573 577 574 - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) 575 - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); 578 + if (default_cpu == -1) 579 + default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); 580 + cpu = default_cpu; 576 581 unlock: 577 582 rcu_read_unlock(); 578 583 return cpu; ··· 1446 1441 } 1447 1442 1448 1443 #ifdef CONFIG_SMP 1449 - 1450 - static inline bool is_per_cpu_kthread(struct task_struct *p) 1451 - { 1452 - if (!(p->flags & PF_KTHREAD)) 1453 - return false; 1454 - 1455 - if (p->nr_cpus_allowed != 1) 1456 - return false; 1457 - 1458 - return true; 1459 - } 1460 1444 1461 1445 /* 1462 1446 * Per-CPU kthreads are allowed to run on !active && online CPUs, see ··· 3663 3669 * statistics and checks timeslices in a time-independent way, regardless 3664 3670 * of when exactly it is running. 
3665 3671 */ 3666 - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) 3672 + if (!tick_nohz_tick_stopped_cpu(cpu)) 3667 3673 goto out_requeue; 3668 3674 3669 3675 rq_lock_irq(rq, &rf); 3670 3676 curr = rq->curr; 3671 - if (is_idle_task(curr) || cpu_is_offline(cpu)) 3677 + if (cpu_is_offline(cpu)) 3672 3678 goto out_unlock; 3673 3679 3680 + curr = rq->curr; 3674 3681 update_rq_clock(rq); 3675 - delta = rq_clock_task(rq) - curr->se.exec_start; 3676 3682 3677 - /* 3678 - * Make sure the next tick runs within a reasonable 3679 - * amount of time. 3680 - */ 3681 - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); 3683 + if (!is_idle_task(curr)) { 3684 + /* 3685 + * Make sure the next tick runs within a reasonable 3686 + * amount of time. 3687 + */ 3688 + delta = rq_clock_task(rq) - curr->se.exec_start; 3689 + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); 3690 + } 3682 3691 curr->sched_class->task_tick(rq, curr, 0); 3683 3692 3693 + calc_load_nohz_remote(rq); 3684 3694 out_unlock: 3685 3695 rq_unlock_irq(rq, &rf); 3686 - 3687 3696 out_requeue: 3697 + 3688 3698 /* 3689 3699 * Run the remote tick once per second (1Hz). This arbitrary 3690 3700 * frequency is large enough to avoid overload but short enough ··· 7061 7063 7062 7064 if (queued) 7063 7065 enqueue_task(rq, tsk, queue_flags); 7064 - if (running) 7066 + if (running) { 7065 7067 set_next_task(rq, tsk); 7068 + /* 7069 + * After changing group, the running task may have joined a 7070 + * throttled one but it's still the running task. Trigger a 7071 + * resched to make sure that task can still run. 7072 + */ 7073 + resched_curr(rq); 7074 + } 7066 7075 7067 7076 task_rq_unlock(rq, tsk, &rf); 7068 7077 } ··· 7265 7260 &req.percent); 7266 7261 if (req.ret) 7267 7262 return req; 7268 - if (req.percent > UCLAMP_PERCENT_SCALE) { 7263 + if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { 7269 7264 req.ret = -ERANGE; 7270 7265 return req; 7271 7266 }
+43 -13
kernel/sched/fair.c
··· 3516 3516 * attach_entity_load_avg - attach this entity to its cfs_rq load avg 3517 3517 * @cfs_rq: cfs_rq to attach to 3518 3518 * @se: sched_entity to attach 3519 - * @flags: migration hints 3520 3519 * 3521 3520 * Must call update_cfs_rq_load_avg() before this, since we rely on 3522 3521 * cfs_rq->avg.last_update_time being current. ··· 5910 5911 if (prev != target && cpus_share_cache(prev, target) && 5911 5912 (available_idle_cpu(prev) || sched_idle_cpu(prev))) 5912 5913 return prev; 5914 + 5915 + /* 5916 + * Allow a per-cpu kthread to stack with the wakee if the 5917 + * kworker thread and the tasks previous CPUs are the same. 5918 + * The assumption is that the wakee queued work for the 5919 + * per-cpu kthread that is now complete and the wakeup is 5920 + * essentially a sync wakeup. An obvious example of this 5921 + * pattern is IO completions. 5922 + */ 5923 + if (is_per_cpu_kthread(current) && 5924 + prev == smp_processor_id() && 5925 + this_rq()->nr_running <= 1) { 5926 + return prev; 5927 + } 5913 5928 5914 5929 /* Check a recently used CPU as a potential idle candidate: */ 5915 5930 recent_used_cpu = p->recent_used_cpu; ··· 8671 8658 /* 8672 8659 * Try to use spare capacity of local group without overloading it or 8673 8660 * emptying busiest. 8674 - * XXX Spreading tasks across NUMA nodes is not always the best policy 8675 - * and special care should be taken for SD_NUMA domain level before 8676 - * spreading the tasks. For now, load_balance() fully relies on 8677 - * NUMA_BALANCING and fbq_classify_group/rq to override the decision. 8678 8661 */ 8679 8662 if (local->group_type == group_has_spare) { 8680 8663 if (busiest->group_type > group_fully_busy) { ··· 8710 8701 env->migration_type = migrate_task; 8711 8702 lsub_positive(&nr_diff, local->sum_nr_running); 8712 8703 env->imbalance = nr_diff >> 1; 8713 - return; 8704 + } else { 8705 + 8706 + /* 8707 + * If there is no overload, we just want to even the number of 8708 + * idle cpus. 
8709 + */ 8710 + env->migration_type = migrate_task; 8711 + env->imbalance = max_t(long, 0, (local->idle_cpus - 8712 + busiest->idle_cpus) >> 1); 8714 8713 } 8715 8714 8716 - /* 8717 - * If there is no overload, we just want to even the number of 8718 - * idle cpus. 8719 - */ 8720 - env->migration_type = migrate_task; 8721 - env->imbalance = max_t(long, 0, (local->idle_cpus - 8722 - busiest->idle_cpus) >> 1); 8715 + /* Consider allowing a small imbalance between NUMA groups */ 8716 + if (env->sd->flags & SD_NUMA) { 8717 + unsigned int imbalance_min; 8718 + 8719 + /* 8720 + * Compute an allowed imbalance based on a simple 8721 + * pair of communicating tasks that should remain 8722 + * local and ignore them. 8723 + * 8724 + * NOTE: Generally this would have been based on 8725 + * the domain size and this was evaluated. However, 8726 + * the benefit is similar across a range of workloads 8727 + * and machines but scaling by the domain size adds 8728 + * the risk that lower domains have to be rebalanced. 8729 + */ 8730 + imbalance_min = 2; 8731 + if (busiest->sum_nr_running <= imbalance_min) 8732 + env->imbalance = 0; 8733 + } 8734 + 8723 8735 return; 8724 8736 } 8725 8737
+23 -10
kernel/sched/loadavg.c
··· 231 231 return calc_load_idx & 1; 232 232 } 233 233 234 - void calc_load_nohz_start(void) 234 + static void calc_load_nohz_fold(struct rq *rq) 235 235 { 236 - struct rq *this_rq = this_rq(); 237 236 long delta; 238 237 239 - /* 240 - * We're going into NO_HZ mode, if there's any pending delta, fold it 241 - * into the pending NO_HZ delta. 242 - */ 243 - delta = calc_load_fold_active(this_rq, 0); 238 + delta = calc_load_fold_active(rq, 0); 244 239 if (delta) { 245 240 int idx = calc_load_write_idx(); 246 241 247 242 atomic_long_add(delta, &calc_load_nohz[idx]); 248 243 } 244 + } 245 + 246 + void calc_load_nohz_start(void) 247 + { 248 + /* 249 + * We're going into NO_HZ mode, if there's any pending delta, fold it 250 + * into the pending NO_HZ delta. 251 + */ 252 + calc_load_nohz_fold(this_rq()); 253 + } 254 + 255 + /* 256 + * Keep track of the load for NOHZ_FULL, must be called between 257 + * calc_load_nohz_{start,stop}(). 258 + */ 259 + void calc_load_nohz_remote(struct rq *rq) 260 + { 261 + calc_load_nohz_fold(rq); 249 262 } 250 263 251 264 void calc_load_nohz_stop(void) ··· 281 268 this_rq->calc_load_update += LOAD_FREQ; 282 269 } 283 270 284 - static long calc_load_nohz_fold(void) 271 + static long calc_load_nohz_read(void) 285 272 { 286 273 int idx = calc_load_read_idx(); 287 274 long delta = 0; ··· 336 323 } 337 324 #else /* !CONFIG_NO_HZ_COMMON */ 338 325 339 - static inline long calc_load_nohz_fold(void) { return 0; } 326 + static inline long calc_load_nohz_read(void) { return 0; } 340 327 static inline void calc_global_nohz(void) { } 341 328 342 329 #endif /* CONFIG_NO_HZ_COMMON */ ··· 359 346 /* 360 347 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. 361 348 */ 362 - delta = calc_load_nohz_fold(); 349 + delta = calc_load_nohz_read(); 363 350 if (delta) 364 351 atomic_long_add(delta, &calc_load_tasks); 365 352
+3
kernel/sched/psi.c
··· 1199 1199 if (static_branch_likely(&psi_disabled)) 1200 1200 return -EOPNOTSUPP; 1201 1201 1202 + if (!nbytes) 1203 + return -EINVAL; 1204 + 1202 1205 buf_size = min(nbytes, sizeof(buf)); 1203 1206 if (copy_from_user(buf, user_buf, buf_size)) 1204 1207 return -EFAULT;
+14 -1
kernel/sched/sched.h
··· 896 896 */ 897 897 unsigned long nr_uninterruptible; 898 898 899 - struct task_struct *curr; 899 + struct task_struct __rcu *curr; 900 900 struct task_struct *idle; 901 901 struct task_struct *stop; 902 902 unsigned long next_balance; ··· 2477 2477 struct mm_struct *prev_mm, 2478 2478 struct mm_struct *next_mm) 2479 2479 { 2480 + } 2481 + #endif 2482 + 2483 + #ifdef CONFIG_SMP 2484 + static inline bool is_per_cpu_kthread(struct task_struct *p) 2485 + { 2486 + if (!(p->flags & PF_KTHREAD)) 2487 + return false; 2488 + 2489 + if (p->nr_cpus_allowed != 1) 2490 + return false; 2491 + 2492 + return true; 2480 2493 } 2481 2494 #endif