Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

-8

include/linux/sched/topology.h

··· 71 71 atomic_t ref; 72 72 atomic_t nr_busy_cpus; 73 73 int has_idle_cores; 74 - 75 - /* 76 - * Some variables from the most recent sd_lb_stats for this domain, 77 - * used by wake_affine(). 78 - */ 79 - unsigned long nr_running; 80 - unsigned long load; 81 - unsigned long capacity; 82 74 }; 83 75 84 76 struct sched_domain {

+46 -94

kernel/sched/fair.c

··· 5356 5356 return 1; 5357 5357 } 5358 5358 5359 - struct llc_stats { 5360 - unsigned long nr_running; 5361 - unsigned long load; 5362 - unsigned long capacity; 5363 - int has_capacity; 5364 - }; 5359 + /* 5360 + * The purpose of wake_affine() is to quickly determine on which CPU we can run 5361 + * soonest. For the purpose of speed we only consider the waking and previous 5362 + * CPU. 5363 + * 5364 + * wake_affine_idle() - only considers 'now', it checks if the waking CPU is (or 5365 + * will be) idle. 5366 + * 5367 + * wake_affine_weight() - considers the weight to reflect the average 5368 + * scheduling latency of the CPUs. This seems to work 5369 + * for the overloaded case. 5370 + */ 5365 5371 5366 - static bool get_llc_stats(struct llc_stats *stats, int cpu) 5372 + static bool 5373 + wake_affine_idle(struct sched_domain *sd, struct task_struct *p, 5374 + int this_cpu, int prev_cpu, int sync) 5367 5375 { 5368 - struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); 5376 + if (idle_cpu(this_cpu)) 5377 + return true; 5369 5378 5370 - if (!sds) 5371 - return false; 5379 + if (sync && cpu_rq(this_cpu)->nr_running == 1) 5380 + return true; 5372 5381 5373 - stats->nr_running = READ_ONCE(sds->nr_running); 5374 - stats->load = READ_ONCE(sds->load); 5375 - stats->capacity = READ_ONCE(sds->capacity); 5376 - stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); 5377 - 5378 - return true; 5382 + return false; 5379 5383 } 5380 5384 5381 - /* 5382 - * Can a task be moved from prev_cpu to this_cpu without causing a load 5383 - * imbalance that would trigger the load balancer? 5384 - * 5385 - * Since we're running on 'stale' values, we might in fact create an imbalance 5386 - * but recomputing these values is expensive, as that'd mean iteration 2 cache 5387 - * domains worth of CPUs. 
5388 - */ 5389 5385 static bool 5390 - wake_affine_llc(struct sched_domain *sd, struct task_struct *p, 5391 - int this_cpu, int prev_cpu, int sync) 5386 + wake_affine_weight(struct sched_domain *sd, struct task_struct *p, 5387 + int this_cpu, int prev_cpu, int sync) 5392 5388 { 5393 - struct llc_stats prev_stats, this_stats; 5394 5389 s64 this_eff_load, prev_eff_load; 5395 5390 unsigned long task_load; 5396 5391 5397 - if (!get_llc_stats(&prev_stats, prev_cpu) || 5398 - !get_llc_stats(&this_stats, this_cpu)) 5399 - return false; 5392 + this_eff_load = target_load(this_cpu, sd->wake_idx); 5393 + prev_eff_load = source_load(prev_cpu, sd->wake_idx); 5400 5394 5401 - /* 5402 - * If sync wakeup then subtract the (maximum possible) 5403 - * effect of the currently running task from the load 5404 - * of the current LLC. 5405 - */ 5406 5395 if (sync) { 5407 5396 unsigned long current_load = task_h_load(current); 5408 5397 5409 - /* in this case load hits 0 and this LLC is considered 'idle' */ 5410 - if (current_load > this_stats.load) 5398 + if (current_load > this_eff_load) 5411 5399 return true; 5412 5400 5413 - this_stats.load -= current_load; 5401 + this_eff_load -= current_load; 5414 5402 } 5415 5403 5416 - /* 5417 - * The has_capacity stuff is not SMT aware, but by trying to balance 5418 - * the nr_running on both ends we try and fill the domain at equal 5419 - * rates, thereby first consuming cores before siblings. 5420 - */ 5421 - 5422 - /* if the old cache has capacity, stay there */ 5423 - if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) 5424 - return false; 5425 - 5426 - /* if this cache has capacity, come here */ 5427 - if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) 5428 - return true; 5429 - 5430 - /* 5431 - * Check to see if we can move the load without causing too much 5432 - * imbalance. 
5433 - */ 5434 5404 task_load = task_h_load(p); 5435 5405 5436 - this_eff_load = 100; 5437 - this_eff_load *= prev_stats.capacity; 5406 + this_eff_load += task_load; 5407 + if (sched_feat(WA_BIAS)) 5408 + this_eff_load *= 100; 5409 + this_eff_load *= capacity_of(prev_cpu); 5438 5410 5439 - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; 5440 - prev_eff_load *= this_stats.capacity; 5441 - 5442 - this_eff_load *= this_stats.load + task_load; 5443 - prev_eff_load *= prev_stats.load - task_load; 5411 + prev_eff_load -= task_load; 5412 + if (sched_feat(WA_BIAS)) 5413 + prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5414 + prev_eff_load *= capacity_of(this_cpu); 5444 5415 5445 5416 return this_eff_load <= prev_eff_load; 5446 5417 } ··· 5420 5449 int prev_cpu, int sync) 5421 5450 { 5422 5451 int this_cpu = smp_processor_id(); 5423 - bool affine; 5452 + bool affine = false; 5424 5453 5425 - /* 5426 - * Default to no affine wakeups; wake_affine() should not effect a task 5427 - * placement the load-balancer feels inclined to undo. The conservative 5428 - * option is therefore to not move tasks when they wake up. 5429 - */ 5430 - affine = false; 5454 + if (sched_feat(WA_IDLE) && !affine) 5455 + affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); 5431 5456 5432 - /* 5433 - * If the wakeup is across cache domains, try to evaluate if movement 5434 - * makes sense, otherwise rely on select_idle_siblings() to do 5435 - * placement inside the cache domain. 
5436 - */ 5437 - if (!cpus_share_cache(prev_cpu, this_cpu)) 5438 - affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); 5457 + if (sched_feat(WA_WEIGHT) && !affine) 5458 + affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); 5439 5459 5440 5460 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); 5441 5461 if (affine) { ··· 7562 7600 */ 7563 7601 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) 7564 7602 { 7565 - struct sched_domain_shared *shared = env->sd->shared; 7566 7603 struct sched_domain *child = env->sd->child; 7567 7604 struct sched_group *sg = env->sd->groups; 7568 7605 struct sg_lb_stats *local = &sds->local_stat; ··· 7633 7672 if (env->dst_rq->rd->overload != overload) 7634 7673 env->dst_rq->rd->overload = overload; 7635 7674 } 7636 - 7637 - if (!shared) 7638 - return; 7639 - 7640 - /* 7641 - * Since these are sums over groups they can contain some CPUs 7642 - * multiple times for the NUMA domains. 7643 - * 7644 - * Currently only wake_affine_llc() and find_busiest_group() 7645 - * uses these numbers, only the last is affected by this problem. 7646 - * 7647 - * XXX fix that. 7648 - */ 7649 - WRITE_ONCE(shared->nr_running, sds->total_running); 7650 - WRITE_ONCE(shared->load, sds->total_load); 7651 - WRITE_ONCE(shared->capacity, sds->total_capacity); 7652 7675 } 7653 7676 7654 7677 /** ··· 8041 8096 { 8042 8097 struct sched_group *sg = env->sd->groups; 8043 8098 int cpu, balance_cpu = -1; 8099 + 8100 + /* 8101 + * Ensure the balancing environment is consistent; can happen 8102 + * when the softirq triggers 'during' hotplug. 8103 + */ 8104 + if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) 8105 + return 0; 8044 8106 8045 8107 /* 8046 8108 * In the newly idle case, we will allow all the cpu's

+3

kernel/sched/features.h

··· 81 81 SCHED_FEAT(LB_MIN, false) 82 82 SCHED_FEAT(ATTACH_AGE_LOAD, true) 83 83 84 + SCHED_FEAT(WA_IDLE, true) 85 + SCHED_FEAT(WA_WEIGHT, true) 86 + SCHED_FEAT(WA_BIAS, true)

Configure Feed

Configure Feed