Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Revert "sched: Improve scalability via 'CPU buddies', which withstand random perturbations"

This reverts commit 970e178985cadbca660feb02f4d2ee3a09f7fdda.

Nikolay Ulyanitsky reported that the 3.6-rc5 kernel has a 15-20%
performance drop on PostgreSQL 9.2 on his machine (running "pgbench").

Borislav Petkov was able to reproduce this, and bisected it to this
commit 970e178985ca ("sched: Improve scalability via 'CPU buddies' ...")
apparently because the new single-idle-buddy model simply doesn't find
idle CPUs to reschedule on aggressively enough.
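As a toy illustration (standalone C with an invented CPU layout and idle states; not kernel code), probing one precomputed buddy per domain level can miss an idle CPU that a scan of the whole cache domain would find:

/* Toy model: one precomputed candidate per level vs. a full scan. */
#include <stdio.h>
#include <stdbool.h>

#define NCPUS 4

static bool idle[NCPUS] = { false, false, true, false };

/* Reverted model: probe only the precomputed buddy at each level. */
static int pick_idle_buddy(const int *buddy, int levels)
{
	for (int l = 0; l < levels; l++)
		if (idle[buddy[l]])
			return buddy[l];
	return -1;
}

/* Restored model (roughly): examine every CPU sharing the cache domain. */
static int pick_idle_scan(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		if (idle[cpu])
			return cpu;
	return -1;
}

int main(void)
{
	const int buddy[] = { 1, 3 };	/* invented buddy assignments */

	printf("buddy model picks: %d\n", pick_idle_buddy(buddy, 2)); /* -1: misses idle CPU 2 */
	printf("scan model picks:  %d\n", pick_idle_scan());          /*  2 */
	return 0;
}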

Mike Galbraith suspects that it is likely due to the user-mode spinlocks
in PostgreSQL not reacting well to preemption, but we don't really know
the details - I'll just revert the commit for now.
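The theory is plausible if you look at what a user-mode spinlock does under preemption. A minimal C11 sketch (illustrative only, not PostgreSQL's actual s_lock code): when the lock holder is preempted, every waiter burns cycles spinning until the scheduler lets the holder run again, so placement decisions that preempt lock holders more often get punished disproportionately.

#include <stdatomic.h>
#include <sched.h>

static atomic_flag lock = ATOMIC_FLAG_INIT;

static void spin_lock(void)
{
	/* If the holder was preempted, waiters spin uselessly until
	 * the scheduler runs the holder again. */
	while (atomic_flag_test_and_set_explicit(&lock, memory_order_acquire))
		sched_yield();	/* crude backoff; real code spins a while first */
}

static void spin_unlock(void)
{
	atomic_flag_clear_explicit(&lock, memory_order_release);
}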

There are hopefully other approaches to improving scheduler scalability
without causing these kinds of downsides.

Reported-by: Nikolay Ulyanitsky <lystor@gmail.com>
Bisected-by: Borislav Petkov <bp@alien8.de>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

+22 -46

include/linux/sched.h (+0 -1)

@@ -954,7 +954,6 @@
 	unsigned int smt_gain;
 	int flags;			/* See SD_* */
 	int level;
-	int idle_buddy;			/* cpu assigned to select_idle_sibling() */
 
 	/* Runtime fields. */
 	unsigned long last_balance;	/* init to jiffies. units in jiffies */
kernel/sched/core.c (+1 -38)

@@ -6014,11 +6014,6 @@
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6032,40 +6027,8 @@
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd) {
-		struct sched_domain *tmp = sd;
-		struct sched_group *sg, *prev;
-		bool right;
-
-		/*
-		 * Traverse to first CPU in group, and count hops
-		 * to cpu from there, switching direction on each
-		 * hop, never ever pointing the last CPU rightward.
-		 */
-		do {
-			id = cpumask_first(sched_domain_span(tmp));
-			prev = sg = tmp->groups;
-			right = 1;
-
-			while (cpumask_first(sched_group_cpus(sg)) != id)
-				sg = sg->next;
-
-			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
-				prev = sg;
-				sg = sg->next;
-				right = !right;
-			}
-
-			/* A CPU went down, never point back to domain start. */
-			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
-				right = false;
-
-			sg = right ? sg->next : prev;
-			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
-		} while ((tmp = tmp->child));
-
+	if (sd)
 		id = cpumask_first(sched_domain_span(sd));
-	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
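For reference, the cpus_share_cache() consumer mentioned in the comment above boils down to comparing the per-cpu cache-domain IDs this function sets up; in kernel/sched/core.c of this era it is essentially:

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}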
kernel/sched/fair.c (+21 -7)

@@ -2637,6 +2637,8 @@
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
+	struct sched_group *sg;
+	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@
 		return prev_cpu;
 
 	/*
-	 * Otherwise, check assigned siblings to find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-
 	for_each_lower_domain(sd) {
-		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
-			continue;
-		if (idle_cpu(sd->idle_buddy))
-			return sd->idle_buddy;
-	}
+		sg = sd->groups;
+		do {
+			if (!cpumask_intersects(sched_group_cpus(sg),
+					tsk_cpus_allowed(p)))
+				goto next;
 
+			for_each_cpu(i, sched_group_cpus(sg)) {
+				if (!idle_cpu(i))
+					goto next;
+			}
+
+			target = cpumask_first_and(sched_group_cpus(sg),
+					tsk_cpus_allowed(p));
+			goto done;
+next:
+			sg = sg->next;
+		} while (sg != sd->groups);
+	}
+done:
 	return target;
 }
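Distilled from the restored fair.c logic above: walk the circular sched_group list and return the first CPU of the first group whose CPUs are all idle, otherwise keep the original target. The sketch below is a standalone simplification (plain structs and a stub in place of the kernel's sched_group/cpumask/idle_cpu() machinery, with the affinity-mask filtering omitted):

#include <stdbool.h>

struct group {
	const int *cpus;	/* CPUs in this group */
	int ncpus;
	struct group *next;	/* circular list, as in the kernel */
};

static bool cpu_idle_stub(int cpu)	/* stand-in for idle_cpu() */
{
	(void)cpu;
	return false;
}

/* First CPU of the first fully idle group, else the original target. */
static int pick_target(struct group *groups, int target)
{
	struct group *sg = groups;

	do {
		int i;

		for (i = 0; i < sg->ncpus; i++)
			if (!cpu_idle_stub(sg->cpus[i]))
				break;		/* group not fully idle */

		if (i == sg->ncpus && sg->ncpus > 0)
			return sg->cpus[0];	/* cpumask_first() analogue */

		sg = sg->next;
	} while (sg != groups);

	return target;
}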