Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

+2 -1

include/linux/sched.h

··· 1399 1399 1400 1400 extern void sched_idle_next(void); 1401 1401 1402 - extern unsigned int sysctl_sched_granularity; 1402 + extern unsigned int sysctl_sched_latency; 1403 + extern unsigned int sysctl_sched_min_granularity; 1403 1404 extern unsigned int sysctl_sched_wakeup_granularity; 1404 1405 extern unsigned int sysctl_sched_batch_wakeup_granularity; 1405 1406 extern unsigned int sysctl_sched_stat_granularity;

+10 -6

kernel/sched.c

··· 4911 4911 static inline void sched_init_granularity(void) 4912 4912 { 4913 4913 unsigned int factor = 1 + ilog2(num_online_cpus()); 4914 - const unsigned long gran_limit = 100000000; 4914 + const unsigned long limit = 100000000; 4915 4915 4916 - sysctl_sched_granularity *= factor; 4917 - if (sysctl_sched_granularity > gran_limit) 4918 - sysctl_sched_granularity = gran_limit; 4916 + sysctl_sched_min_granularity *= factor; 4917 + if (sysctl_sched_min_granularity > limit) 4918 + sysctl_sched_min_granularity = limit; 4919 4919 4920 - sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; 4921 - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; 4920 + sysctl_sched_latency *= factor; 4921 + if (sysctl_sched_latency > limit) 4922 + sysctl_sched_latency = limit; 4923 + 4924 + sysctl_sched_runtime_limit = sysctl_sched_latency * 5; 4925 + sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; 4922 4926 } 4923 4927 4924 4928 #ifdef CONFIG_SMP

+65 -12

kernel/sched_fair.c

··· 15 15 * 16 16 * Scaled math optimizations by Thomas Gleixner 17 17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> 18 + * 19 + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 18 21 */ 19 22 20 23 /* 21 - * Preemption granularity: 22 - * (default: 10 msec, units: nanoseconds) 24 + * Targeted preemption latency for CPU-bound tasks: 25 + * (default: 20ms, units: nanoseconds) 23 26 * 24 - * NOTE: this granularity value is not the same as the concept of 25 - * 'timeslice length' - timeslices in CFS will typically be somewhat 26 - * larger than this value. (to see the precise effective timeslice 27 - * length of your workload, run vmstat and monitor the context-switches 28 - * field) 27 + * NOTE: this latency value is not the same as the concept of 28 + * 'timeslice length' - timeslices in CFS are of variable length. 29 + * (to see the precise effective timeslice length of your workload, 30 + * run vmstat and monitor the context-switches field) 29 31 * 30 32 * On SMP systems the value of this is multiplied by the log2 of the 31 33 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 32 34 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 35 + * Targeted preemption latency for CPU-bound tasks: 33 36 */ 34 - unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; 37 + unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; 38 + 39 + /* 40 + * Minimal preemption granularity for CPU-bound tasks: 41 + * (default: 2 msec, units: nanoseconds) 42 + */ 43 + unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; 35 44 36 45 /* 37 46 * SCHED_BATCH wake-up granularity. ··· 222 213 */ 223 214 224 215 /* 216 + * Calculate the preemption granularity needed to schedule every 217 + * runnable task once per sysctl_sched_latency amount of time. 
218 + * (down to a sensible low limit on granularity) 219 + * 220 + * For example, if there are 2 tasks running and latency is 10 msecs, 221 + * we switch tasks every 5 msecs. If we have 3 tasks running, we have 222 + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency 223 + * for each task. We do finer and finer scheduling until we 224 + * reach the minimum granularity value. 225 + * 226 + * To achieve this we use the following dynamic-granularity rule: 227 + * 228 + * gran = lat/nr - lat/nr/nr 229 + * 230 + * This comes out of the following equations: 231 + * 232 + * kA1 + gran = kB1 233 + * kB2 + gran = kA2 234 + * kA2 = kA1 235 + * kB2 = kB1 - d + d/nr 236 + * lat = d * nr 237 + * 238 + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), 239 + * '1' is start of time, '2' is end of time, 'd' is delay between 240 + * 1 and 2 (during which task B was running), 'nr' is number of tasks 241 + * running, 'lat' is the period of each task. ('lat' is the 242 + * sched_latency that we aim for.) 
243 + */ 244 + static long 245 + sched_granularity(struct cfs_rq *cfs_rq) 246 + { 247 + unsigned int gran = sysctl_sched_latency; 248 + unsigned int nr = cfs_rq->nr_running; 249 + 250 + if (nr > 1) { 251 + gran = gran/nr - gran/nr/nr; 252 + gran = max(gran, sysctl_sched_min_granularity); 253 + } 254 + 255 + return gran; 256 + } 257 + 258 + /* 225 259 * We rescale the rescheduling granularity of tasks according to their 226 260 * nice level, but only linearly, not exponentially: 227 261 */ ··· 354 302 delta_fair = calc_delta_fair(delta_exec, lw); 355 303 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); 356 304 357 - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { 305 + if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { 358 306 delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); 359 307 delta = min(delta, (unsigned long)( 360 308 (long)sysctl_sched_runtime_limit - curr->wait_runtime)); ··· 741 689 if (next == curr) 742 690 return; 743 691 744 - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); 692 + __check_preempt_curr_fair(cfs_rq, next, curr, 693 + sched_granularity(cfs_rq)); 745 694 } 746 695 747 696 /************************************************** ··· 1087 1034 * it will preempt the parent: 1088 1035 */ 1089 1036 p->se.fair_key = current->se.fair_key - 1090 - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; 1037 + niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; 1091 1038 /* 1092 1039 * The first wait is dominated by the child-runs-first logic, 1093 1040 * so do not credit it with that waiting time yet: ··· 1100 1047 * -granularity/2, so initialize the task with that: 1101 1048 */ 1102 1049 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1103 - p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); 1050 + p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); 1104 1051 1105 1052 __enqueue_entity(cfs_rq, se); 1106 1053 }

+22 -11

kernel/sysctl.c

··· 222 222 #ifdef CONFIG_SCHED_DEBUG 223 223 { 224 224 .ctl_name = CTL_UNNUMBERED, 225 - .procname = "sched_granularity_ns", 226 - .data = &sysctl_sched_granularity, 225 + .procname = "sched_min_granularity_ns", 226 + .data = &sysctl_sched_min_granularity, 227 + .maxlen = sizeof(unsigned int), 228 + .mode = 0644, 229 + .proc_handler = &proc_dointvec_minmax, 230 + .strategy = &sysctl_intvec, 231 + .extra1 = &min_sched_granularity_ns, 232 + .extra2 = &max_sched_granularity_ns, 233 + }, 234 + { 235 + .ctl_name = CTL_UNNUMBERED, 236 + .procname = "sched_latency_ns", 237 + .data = &sysctl_sched_latency, 227 238 .maxlen = sizeof(unsigned int), 228 239 .mode = 0644, 229 240 .proc_handler = &proc_dointvec_minmax, ··· 294 283 .mode = 0644, 295 284 .proc_handler = &proc_dointvec, 296 285 }, 286 + { 287 + .ctl_name = CTL_UNNUMBERED, 288 + .procname = "sched_features", 289 + .data = &sysctl_sched_features, 290 + .maxlen = sizeof(unsigned int), 291 + .mode = 0644, 292 + .proc_handler = &proc_dointvec, 293 + }, 294 + #endif 297 295 #ifdef CONFIG_PROVE_LOCKING 298 296 { 299 297 .ctl_name = CTL_UNNUMBERED, ··· 319 299 .procname = "lock_stat", 320 300 .data = &lock_stat, 321 301 .maxlen = sizeof(int), 322 - .mode = 0644, 323 - .proc_handler = &proc_dointvec, 324 - }, 325 - #endif 326 - { 327 - .ctl_name = CTL_UNNUMBERED, 328 - .procname = "sched_features", 329 - .data = &sysctl_sched_features, 330 - .maxlen = sizeof(unsigned int), 331 302 .mode = 0644, 332 303 .proc_handler = &proc_dointvec, 333 304 },

Configure Feed

Configure Feed