Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

+10 -4

include/linux/sched.h

··· 844 844 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 845 845 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 846 846 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 847 + #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 847 848 848 849 enum powersavings_balance_level { 849 850 POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ ··· 894 893 return 0; 895 894 } 896 895 897 - struct sched_group { 898 - struct sched_group *next; /* Must be a circular list */ 896 + struct sched_group_power { 899 897 atomic_t ref; 900 - 901 898 /* 902 899 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 903 900 * single CPU. 904 901 */ 905 - unsigned int cpu_power, cpu_power_orig; 902 + unsigned int power, power_orig; 903 + }; 904 + 905 + struct sched_group { 906 + struct sched_group *next; /* Must be a circular list */ 907 + atomic_t ref; 908 + 906 909 unsigned int group_weight; 910 + struct sched_group_power *sgp; 907 911 908 912 /* 909 913 * The CPUs this group covers.

+155 -34

kernel/sched.c

··· 6557 6557 break; 6558 6558 } 6559 6559 6560 - if (!group->cpu_power) { 6560 + if (!group->sgp->power) { 6561 6561 printk(KERN_CONT "\n"); 6562 6562 printk(KERN_ERR "ERROR: domain->cpu_power not " 6563 6563 "set\n"); ··· 6581 6581 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6582 6582 6583 6583 printk(KERN_CONT " %s", str); 6584 - if (group->cpu_power != SCHED_POWER_SCALE) { 6584 + if (group->sgp->power != SCHED_POWER_SCALE) { 6585 6585 printk(KERN_CONT " (cpu_power = %d)", 6586 - group->cpu_power); 6586 + group->sgp->power); 6587 6587 } 6588 6588 6589 6589 group = group->next; ··· 6774 6774 return rd; 6775 6775 } 6776 6776 6777 + static void free_sched_groups(struct sched_group *sg, int free_sgp) 6778 + { 6779 + struct sched_group *tmp, *first; 6780 + 6781 + if (!sg) 6782 + return; 6783 + 6784 + first = sg; 6785 + do { 6786 + tmp = sg->next; 6787 + 6788 + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 6789 + kfree(sg->sgp); 6790 + 6791 + kfree(sg); 6792 + sg = tmp; 6793 + } while (sg != first); 6794 + } 6795 + 6777 6796 static void free_sched_domain(struct rcu_head *rcu) 6778 6797 { 6779 6798 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 6780 - if (atomic_dec_and_test(&sd->groups->ref)) 6799 + 6800 + /* 6801 + * If its an overlapping domain it has private groups, iterate and 6802 + * nuke them all. 6803 + */ 6804 + if (sd->flags & SD_OVERLAP) { 6805 + free_sched_groups(sd->groups, 1); 6806 + } else if (atomic_dec_and_test(&sd->groups->ref)) { 6807 + kfree(sd->groups->sgp); 6781 6808 kfree(sd->groups); 6809 + } 6782 6810 kfree(sd); 6783 6811 } 6784 6812 ··· 6973 6945 struct sd_data { 6974 6946 struct sched_domain **__percpu sd; 6975 6947 struct sched_group **__percpu sg; 6948 + struct sched_group_power **__percpu sgp; 6976 6949 }; 6977 6950 6978 6951 struct s_data { ··· 6993 6964 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 6994 6965 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 6995 6966 6967 + #define SDTL_OVERLAP 0x01 6968 + 6996 6969 struct sched_domain_topology_level { 6997 6970 sched_domain_init_f init; 6998 6971 sched_domain_mask_f mask; 6972 + int flags; 6999 6973 struct sd_data data; 7000 6974 }; 7001 6975 7002 - /* 7003 - * Assumes the sched_domain tree is fully constructed 7004 - */ 6976 + static int 6977 + build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6978 + { 6979 + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 6980 + const struct cpumask *span = sched_domain_span(sd); 6981 + struct cpumask *covered = sched_domains_tmpmask; 6982 + struct sd_data *sdd = sd->private; 6983 + struct sched_domain *child; 6984 + int i; 6985 + 6986 + cpumask_clear(covered); 6987 + 6988 + for_each_cpu(i, span) { 6989 + struct cpumask *sg_span; 6990 + 6991 + if (cpumask_test_cpu(i, covered)) 6992 + continue; 6993 + 6994 + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6995 + GFP_KERNEL, cpu_to_node(i)); 6996 + 6997 + if (!sg) 6998 + goto fail; 6999 + 7000 + sg_span = sched_group_cpus(sg); 7001 + 7002 + child = *per_cpu_ptr(sdd->sd, i); 7003 + if (child->child) { 7004 + child = child->child; 7005 + cpumask_copy(sg_span, sched_domain_span(child)); 7006 + } else 7007 + cpumask_set_cpu(i, sg_span); 7008 + 7009 + cpumask_or(covered, covered, sg_span); 7010 + 7011 + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 7012 + atomic_inc(&sg->sgp->ref); 7013 + 7014 + if (cpumask_test_cpu(cpu, sg_span)) 7015 + groups = sg; 7016 + 7017 + if (!first) 7018 + first = sg; 7019 + if (last) 7020 + last->next = sg; 7021 + last = sg; 7022 + last->next = first; 7023 + } 7024 + sd->groups = groups; 7025 + 7026 + return 0; 7027 + 7028 + fail: 7029 + free_sched_groups(first, 0); 7030 + 7031 + return -ENOMEM; 7032 + } 7033 + 7005 7034 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 7006 7035 { 7007 7036 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); ··· 7068 6981 if (child) 7069 6982 cpu = cpumask_first(sched_domain_span(child)); 7070 6983 7071 - if (sg) 6984 + if (sg) { 7072 6985 *sg = *per_cpu_ptr(sdd->sg, cpu); 6986 + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 6987 + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 6988 + } 7073 6989 7074 6990 return cpu; 7075 6991 } 7076 6992 7077 6993 /* 7078 - * build_sched_groups takes the cpumask we wish to span, and a pointer 7079 - * to a function which identifies what group(along with sched group) a CPU 7080 - * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids 7081 - * (due to the fact that we keep track of groups covered with a struct cpumask). 7082 - * 7083 6994 * build_sched_groups will build a circular linked list of the groups 7084 6995 * covered by the given span, and will set each group's ->cpumask correctly, 7085 6996 * and ->cpu_power to 0. 6997 + * 6998 + * Assumes the sched_domain tree is fully constructed 7086 6999 */ 7087 - static void 7088 - build_sched_groups(struct sched_domain *sd) 7000 + static int 7001 + build_sched_groups(struct sched_domain *sd, int cpu) 7089 7002 { 7090 7003 struct sched_group *first = NULL, *last = NULL; 7091 7004 struct sd_data *sdd = sd->private; 7092 7005 const struct cpumask *span = sched_domain_span(sd); 7093 7006 struct cpumask *covered; 7094 7007 int i; 7008 + 7009 + get_group(cpu, sdd, &sd->groups); 7010 + atomic_inc(&sd->groups->ref); 7011 + 7012 + if (cpu != cpumask_first(sched_domain_span(sd))) 7013 + return 0; 7095 7014 7096 7015 lockdep_assert_held(&sched_domains_mutex); 7097 7016 covered = sched_domains_tmpmask; ··· 7113 7020 continue; 7114 7021 7115 7022 cpumask_clear(sched_group_cpus(sg)); 7116 - sg->cpu_power = 0; 7023 + sg->sgp->power = 0; 7117 7024 7118 7025 for_each_cpu(j, span) { 7119 7026 if (get_group(j, sdd, NULL) != group) ··· 7130 7037 last = sg; 7131 7038 } 7132 7039 last->next = first; 7040 + 7041 + return 0; 7133 7042 } 7134 7043 7135 7044 /* ··· 7146 7051 */ 7147 7052 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7148 7053 { 7149 - WARN_ON(!sd || !sd->groups); 7054 + struct sched_group *sg = sd->groups; 7150 7055 7151 - if (cpu != group_first_cpu(sd->groups)) 7056 + WARN_ON(!sd || !sg); 7057 + 7058 + do { 7059 + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 7060 + sg = sg->next; 7061 + } while (sg != sd->groups); 7062 + 7063 + if (cpu != group_first_cpu(sg)) 7152 7064 return; 7153 - 7154 - sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7155 7065 7156 7066 update_group_power(sd, cpu); 7157 7067 } ··· 7277 7177 static void claim_allocations(int cpu, struct sched_domain *sd) 7278 7178 { 7279 7179 struct sd_data *sdd = sd->private; 7280 - struct sched_group *sg = sd->groups; 7281 7180 7282 7181 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 7283 7182 *per_cpu_ptr(sdd->sd, cpu) = NULL; 7284 7183 7285 - if (cpu == cpumask_first(sched_group_cpus(sg))) { 7286 - WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); 7184 + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 7287 7185 *per_cpu_ptr(sdd->sg, cpu) = NULL; 7288 - } 7186 + 7187 + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 7188 + *per_cpu_ptr(sdd->sgp, cpu) = NULL; 7289 7189 } 7290 7190 7291 7191 #ifdef CONFIG_SCHED_SMT ··· 7310 7210 #endif 7311 7211 { sd_init_CPU, cpu_cpu_mask, }, 7312 7212 #ifdef CONFIG_NUMA 7313 - { sd_init_NODE, cpu_node_mask, }, 7213 + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, 7314 7214 { sd_init_ALLNODES, cpu_allnodes_mask, }, 7315 7215 #endif 7316 7216 { NULL, }, ··· 7334 7234 if (!sdd->sg) 7335 7235 return -ENOMEM; 7336 7236 7237 + sdd->sgp = alloc_percpu(struct sched_group_power *); 7238 + if (!sdd->sgp) 7239 + return -ENOMEM; 7240 + 7337 7241 for_each_cpu(j, cpu_map) { 7338 7242 struct sched_domain *sd; 7339 7243 struct sched_group *sg; 7244 + struct sched_group_power *sgp; 7340 7245 7341 7246 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 7342 7247 GFP_KERNEL, cpu_to_node(j)); ··· 7356 7251 return -ENOMEM; 7357 7252 7358 7253 *per_cpu_ptr(sdd->sg, j) = sg; 7254 + 7255 + sgp = kzalloc_node(sizeof(struct sched_group_power), 7256 + GFP_KERNEL, cpu_to_node(j)); 7257 + if (!sgp) 7258 + return -ENOMEM; 7259 + 7260 + *per_cpu_ptr(sdd->sgp, j) = sgp; 7359 7261 } 7360 7262 } 7361 7263 ··· 7378 7266 struct sd_data *sdd = &tl->data; 7379 7267 7380 7268 for_each_cpu(j, cpu_map) { 7381 - kfree(*per_cpu_ptr(sdd->sd, j)); 7269 + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 7270 + if (sd && (sd->flags & SD_OVERLAP)) 7271 + free_sched_groups(sd->groups, 0); 7382 7272 kfree(*per_cpu_ptr(sdd->sg, j)); 7273 + kfree(*per_cpu_ptr(sdd->sgp, j)); 7383 7274 } 7384 7275 free_percpu(sdd->sd); 7385 7276 free_percpu(sdd->sg); 7277 + free_percpu(sdd->sgp); 7386 7278 } 7387 7279 } 7388 7280 ··· 7432 7316 struct sched_domain_topology_level *tl; 7433 7317 7434 7318 sd = NULL; 7435 - for (tl = sched_domain_topology; tl->init; tl++) 7319 + for (tl = sched_domain_topology; tl->init; tl++) { 7436 7320 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 7321 + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 7322 + sd->flags |= SD_OVERLAP; 7323 + if (cpumask_equal(cpu_map, sched_domain_span(sd))) 7324 + break; 7325 + } 7437 7326 7438 7327 while (sd->child) 7439 7328 sd = sd->child; ··· 7450 7329 for_each_cpu(i, cpu_map) { 7451 7330 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 7452 7331 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 7453 - get_group(i, sd->private, &sd->groups); 7454 - atomic_inc(&sd->groups->ref); 7455 - 7456 - if (i != cpumask_first(sched_domain_span(sd))) 7457 - continue; 7458 - 7459 - build_sched_groups(sd); 7332 + if (sd->flags & SD_OVERLAP) { 7333 + if (build_overlap_sched_groups(sd, i)) 7334 + goto error; 7335 + } else { 7336 + if (build_sched_groups(sd, i)) 7337 + goto error; 7338 + } 7460 7339 } 7461 7340 } 7462 7341

+23 -23

kernel/sched_fair.c

··· 1585 1585 } 1586 1586 1587 1587 /* Adjust by relative CPU power of the group */ 1588 - avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; 1588 + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; 1589 1589 1590 1590 if (local_group) { 1591 1591 this_load = avg_load; ··· 2631 2631 power >>= SCHED_POWER_SHIFT; 2632 2632 } 2633 2633 2634 - sdg->cpu_power_orig = power; 2634 + sdg->sgp->power_orig = power; 2635 2635 2636 2636 if (sched_feat(ARCH_POWER)) 2637 2637 power *= arch_scale_freq_power(sd, cpu); ··· 2647 2647 power = 1; 2648 2648 2649 2649 cpu_rq(cpu)->cpu_power = power; 2650 - sdg->cpu_power = power; 2650 + sdg->sgp->power = power; 2651 2651 } 2652 2652 2653 2653 static void update_group_power(struct sched_domain *sd, int cpu) ··· 2665 2665 2666 2666 group = child->groups; 2667 2667 do { 2668 - power += group->cpu_power; 2668 + power += group->sgp->power; 2669 2669 group = group->next; 2670 2670 } while (group != child->groups); 2671 2671 2672 - sdg->cpu_power = power; 2672 + sdg->sgp->power = power; 2673 2673 } 2674 2674 2675 2675 /* ··· 2691 2691 /* 2692 2692 * If ~90% of the cpu_power is still there, we're good. 2693 2693 */ 2694 - if (group->cpu_power * 32 > group->cpu_power_orig * 29) 2694 + if (group->sgp->power * 32 > group->sgp->power_orig * 29) 2695 2695 return 1; 2696 2696 2697 2697 return 0; ··· 2771 2771 } 2772 2772 2773 2773 /* Adjust by relative CPU power of the group */ 2774 - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; 2774 + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; 2775 2775 2776 2776 /* 2777 2777 * Consider the group unbalanced when the imbalance is larger ··· 2788 2788 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 2789 2789 sgs->group_imb = 1; 2790 2790 2791 - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, 2791 + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 2792 2792 SCHED_POWER_SCALE); 2793 2793 if (!sgs->group_capacity) 2794 2794 sgs->group_capacity = fix_small_capacity(sd, group); ··· 2877 2877 return; 2878 2878 2879 2879 sds->total_load += sgs.group_load; 2880 - sds->total_pwr += sg->cpu_power; 2880 + sds->total_pwr += sg->sgp->power; 2881 2881 2882 2882 /* 2883 2883 * In case the child domain prefers tasks go to siblings ··· 2962 2962 if (this_cpu > busiest_cpu) 2963 2963 return 0; 2964 2964 2965 - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, 2965 + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 2966 2966 SCHED_POWER_SCALE); 2967 2967 return 1; 2968 2968 } ··· 2993 2993 2994 2994 scaled_busy_load_per_task = sds->busiest_load_per_task 2995 2995 * SCHED_POWER_SCALE; 2996 - scaled_busy_load_per_task /= sds->busiest->cpu_power; 2996 + scaled_busy_load_per_task /= sds->busiest->sgp->power; 2997 2997 2998 2998 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 2999 2999 (scaled_busy_load_per_task * imbn)) { ··· 3007 3007 * moving them. 3008 3008 */ 3009 3009 3010 - pwr_now += sds->busiest->cpu_power * 3010 + pwr_now += sds->busiest->sgp->power * 3011 3011 min(sds->busiest_load_per_task, sds->max_load); 3012 - pwr_now += sds->this->cpu_power * 3012 + pwr_now += sds->this->sgp->power * 3013 3013 min(sds->this_load_per_task, sds->this_load); 3014 3014 pwr_now /= SCHED_POWER_SCALE; 3015 3015 3016 3016 /* Amount of load we'd subtract */ 3017 3017 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 3018 - sds->busiest->cpu_power; 3018 + sds->busiest->sgp->power; 3019 3019 if (sds->max_load > tmp) 3020 - pwr_move += sds->busiest->cpu_power * 3020 + pwr_move += sds->busiest->sgp->power * 3021 3021 min(sds->busiest_load_per_task, sds->max_load - tmp); 3022 3022 3023 3023 /* Amount of load we'd add */ 3024 - if (sds->max_load * sds->busiest->cpu_power < 3024 + if (sds->max_load * sds->busiest->sgp->power < 3025 3025 sds->busiest_load_per_task * SCHED_POWER_SCALE) 3026 - tmp = (sds->max_load * sds->busiest->cpu_power) / 3027 - sds->this->cpu_power; 3026 + tmp = (sds->max_load * sds->busiest->sgp->power) / 3027 + sds->this->sgp->power; 3028 3028 else 3029 3029 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 3030 - sds->this->cpu_power; 3031 - pwr_move += sds->this->cpu_power * 3030 + sds->this->sgp->power; 3031 + pwr_move += sds->this->sgp->power * 3032 3032 min(sds->this_load_per_task, sds->this_load + tmp); 3033 3033 pwr_move /= SCHED_POWER_SCALE; 3034 3034 ··· 3074 3074 3075 3075 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 3076 3076 3077 - load_above_capacity /= sds->busiest->cpu_power; 3077 + load_above_capacity /= sds->busiest->sgp->power; 3078 3078 } 3079 3079 3080 3080 /* ··· 3090 3090 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 3091 3091 3092 3092 /* How much load to actually move to equalise the imbalance */ 3093 - *imbalance = min(max_pull * sds->busiest->cpu_power, 3094 - (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3093 + *imbalance = min(max_pull * sds->busiest->sgp->power, 3094 + (sds->avg_load - sds->this_load) * sds->this->sgp->power) 3095 3095 / SCHED_POWER_SCALE; 3096 3096 3097 3097 /*

+2

kernel/sched_features.h

··· 70 70 * using the scheduler IPI. Reduces rq->lock contention/bounces. 71 71 */ 72 72 SCHED_FEAT(TTWU_QUEUE, 1) 73 + 74 + SCHED_FEAT(FORCE_SD_OVERLAP, 0)

Configure Feed

Configure Feed