Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'core-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RCU fixes from Thomas Gleixner:
"Two RCU patches:
- Address a serious performance regression on open/close caused by
commit ac1bea85781e ("Make cond_resched() report RCU quiescent
states")
- Export RCU debug functions. Not a regression, but enablement to
address a serious recursion bug in the sl*b allocators in 3.17"

* 'core-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
rcu: Reduce overhead of cond_resched() checks for RCU
rcu: Export debug_init_rcu_head() and debug_rcu_head_free()

+137 -92
+6
Documentation/kernel-parameters.txt
··· 2790 2790 leaf rcu_node structure. Useful for very large 2791 2791 systems. 2792 2792 2793 + rcutree.jiffies_till_sched_qs= [KNL] 2794 + Set required age in jiffies for a 2795 + given grace period before RCU starts 2796 + soliciting quiescent-state help from 2797 + rcu_note_context_switch(). 2798 + 2793 2799 rcutree.jiffies_till_first_fqs= [KNL] 2794 2800 Set delay from grace-period initialization to 2795 2801 first attempt to force quiescent states.
+10 -36
include/linux/rcupdate.h
··· 44 44 #include <linux/debugobjects.h> 45 45 #include <linux/bug.h> 46 46 #include <linux/compiler.h> 47 - #include <linux/percpu.h> 48 47 #include <asm/barrier.h> 49 48 50 49 extern int rcu_expedited; /* for sysctl */ ··· 299 300 #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ 300 301 301 302 /* 302 - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. 303 - */ 304 - 305 - #define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */ 306 - DECLARE_PER_CPU(int, rcu_cond_resched_count); 307 - void rcu_resched(void); 308 - 309 - /* 310 - * Is it time to report RCU quiescent states? 311 - * 312 - * Note unsynchronized access to rcu_cond_resched_count. Yes, we might 313 - * increment some random CPU's count, and possibly also load the result from 314 - * yet another CPU's count. We might even clobber some other CPU's attempt 315 - * to zero its counter. This is all OK because the goal is not precision, 316 - * but rather reasonable amortization of rcu_note_context_switch() overhead 317 - * and extremely high probability of avoiding RCU CPU stall warnings. 318 - * Note that this function has to be preempted in just the wrong place, 319 - * many thousands of times in a row, for anything bad to happen. 320 - */ 321 - static inline bool rcu_should_resched(void) 322 - { 323 - return raw_cpu_inc_return(rcu_cond_resched_count) >= 324 - RCU_COND_RESCHED_LIM; 325 - } 326 - 327 - /* 328 - * Report quiscent states to RCU if it is time to do so. 329 - */ 330 - static inline void rcu_cond_resched(void) 331 - { 332 - if (unlikely(rcu_should_resched())) 333 - rcu_resched(); 334 - } 335 - 336 - /* 337 303 * Infrastructure to implement the synchronize_() primitives in 338 304 * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. 339 305 */ ··· 322 358 * initialization. 
323 359 */ 324 360 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 361 + void init_rcu_head(struct rcu_head *head); 362 + void destroy_rcu_head(struct rcu_head *head); 325 363 void init_rcu_head_on_stack(struct rcu_head *head); 326 364 void destroy_rcu_head_on_stack(struct rcu_head *head); 327 365 #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 366 + static inline void init_rcu_head(struct rcu_head *head) 367 + { 368 + } 369 + 370 + static inline void destroy_rcu_head(struct rcu_head *head) 371 + { 372 + } 373 + 328 374 static inline void init_rcu_head_on_stack(struct rcu_head *head) 329 375 { 330 376 }
+112 -28
kernel/rcu/tree.c
··· 206 206 rdp->passed_quiesce = 1; 207 207 } 208 208 209 + static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 210 + 211 + static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 212 + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 213 + .dynticks = ATOMIC_INIT(1), 214 + #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 215 + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 216 + .dynticks_idle = ATOMIC_INIT(1), 217 + #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 218 + }; 219 + 220 + /* 221 + * Let the RCU core know that this CPU has gone through the scheduler, 222 + * which is a quiescent state. This is called when the need for a 223 + * quiescent state is urgent, so we burn an atomic operation and full 224 + * memory barriers to let the RCU core know about it, regardless of what 225 + * this CPU might (or might not) do in the near future. 226 + * 227 + * We inform the RCU core by emulating a zero-duration dyntick-idle 228 + * period, which we in turn do by incrementing the ->dynticks counter 229 + * by two. 230 + */ 231 + static void rcu_momentary_dyntick_idle(void) 232 + { 233 + unsigned long flags; 234 + struct rcu_data *rdp; 235 + struct rcu_dynticks *rdtp; 236 + int resched_mask; 237 + struct rcu_state *rsp; 238 + 239 + local_irq_save(flags); 240 + 241 + /* 242 + * Yes, we can lose flag-setting operations. This is OK, because 243 + * the flag will be set again after some delay. 244 + */ 245 + resched_mask = raw_cpu_read(rcu_sched_qs_mask); 246 + raw_cpu_write(rcu_sched_qs_mask, 0); 247 + 248 + /* Find the flavor that needs a quiescent state. */ 249 + for_each_rcu_flavor(rsp) { 250 + rdp = raw_cpu_ptr(rsp->rda); 251 + if (!(resched_mask & rsp->flavor_mask)) 252 + continue; 253 + smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ 254 + if (ACCESS_ONCE(rdp->mynode->completed) != 255 + ACCESS_ONCE(rdp->cond_resched_completed)) 256 + continue; 257 + 258 + /* 259 + * Pretend to be momentarily idle for the quiescent state. 
260 + * This allows the grace-period kthread to record the 261 + * quiescent state, with no need for this CPU to do anything 262 + * further. 263 + */ 264 + rdtp = this_cpu_ptr(&rcu_dynticks); 265 + smp_mb__before_atomic(); /* Earlier stuff before QS. */ 266 + atomic_add(2, &rdtp->dynticks); /* QS. */ 267 + smp_mb__after_atomic(); /* Later stuff after QS. */ 268 + break; 269 + } 270 + local_irq_restore(flags); 271 + } 272 + 209 273 /* 210 274 * Note a context switch. This is a quiescent state for RCU-sched, 211 275 * and requires special handling for preemptible RCU. ··· 280 216 trace_rcu_utilization(TPS("Start context switch")); 281 217 rcu_sched_qs(cpu); 282 218 rcu_preempt_note_context_switch(cpu); 219 + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 220 + rcu_momentary_dyntick_idle(); 283 221 trace_rcu_utilization(TPS("End context switch")); 284 222 } 285 223 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 286 - 287 - static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 288 - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 289 - .dynticks = ATOMIC_INIT(1), 290 - #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 291 - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 292 - .dynticks_idle = ATOMIC_INIT(1), 293 - #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 294 - }; 295 224 296 225 static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 297 226 static long qhimark = 10000; /* If this many pending, ignore blimit. */ ··· 299 242 300 243 module_param(jiffies_till_first_fqs, ulong, 0644); 301 244 module_param(jiffies_till_next_fqs, ulong, 0644); 245 + 246 + /* 247 + * How long the grace period must be before we start recruiting 248 + * quiescent-state help from rcu_note_context_switch(). 
249 + */ 250 + static ulong jiffies_till_sched_qs = HZ / 20; 251 + module_param(jiffies_till_sched_qs, ulong, 0644); 302 252 303 253 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 304 254 struct rcu_data *rdp); ··· 917 853 bool *isidle, unsigned long *maxj) 918 854 { 919 855 unsigned int curr; 856 + int *rcrmp; 920 857 unsigned int snap; 921 858 922 859 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); ··· 958 893 } 959 894 960 895 /* 961 - * There is a possibility that a CPU in adaptive-ticks state 962 - * might run in the kernel with the scheduling-clock tick disabled 963 - * for an extended time period. Invoke rcu_kick_nohz_cpu() to 964 - * force the CPU to restart the scheduling-clock tick in this 965 - * CPU is in this state. 896 + * A CPU running for an extended time within the kernel can 897 + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, 898 + * even context-switching back and forth between a pair of 899 + * in-kernel CPU-bound tasks cannot advance grace periods. 900 + * So if the grace period is old enough, make the CPU pay attention. 901 + * Note that the unsynchronized assignments to the per-CPU 902 + * rcu_sched_qs_mask variable are safe. Yes, setting of 903 + * bits can be lost, but they will be set again on the next 904 + * force-quiescent-state pass. So lost bit sets do not result 905 + * in incorrect behavior, merely in a grace period lasting 906 + * a few jiffies longer than it might otherwise. Because 907 + * there are at most four threads involved, and because the 908 + * updates are only once every few jiffies, the probability of 909 + * lossage (and thus of slight grace-period extension) is 910 + * quite low. 911 + * 912 + * Note that if the jiffies_till_sched_qs boot/sysfs parameter 913 + * is set too high, we override with half of the RCU CPU stall 914 + * warning delay. 
966 915 */ 967 - rcu_kick_nohz_cpu(rdp->cpu); 968 - 969 - /* 970 - * Alternatively, the CPU might be running in the kernel 971 - * for an extended period of time without a quiescent state. 972 - * Attempt to force the CPU through the scheduler to gain the 973 - * needed quiescent state, but only if the grace period has gone 974 - * on for an uncommonly long time. If there are many stuck CPUs, 975 - * we will beat on the first one until it gets unstuck, then move 976 - * to the next. Only do this for the primary flavor of RCU. 977 - */ 978 - if (rdp->rsp == rcu_state_p && 916 + rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); 917 + if (ULONG_CMP_GE(jiffies, 918 + rdp->rsp->gp_start + jiffies_till_sched_qs) || 979 919 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 980 - rdp->rsp->jiffies_resched += 5; 981 - resched_cpu(rdp->cpu); 920 + if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { 921 + ACCESS_ONCE(rdp->cond_resched_completed) = 922 + ACCESS_ONCE(rdp->mynode->completed); 923 + smp_mb(); /* ->cond_resched_completed before *rcrmp. */ 924 + ACCESS_ONCE(*rcrmp) = 925 + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; 926 + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ 927 + rdp->rsp->jiffies_resched += 5; /* Enable beating. */ 928 + } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 929 + /* Time to beat on that CPU again! */ 930 + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ 931 + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. 
*/ 932 + } 982 933 } 983 934 984 935 return 0; ··· 3572 3491 "rcu_node_fqs_1", 3573 3492 "rcu_node_fqs_2", 3574 3493 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3494 + static u8 fl_mask = 0x1; 3575 3495 int cpustride = 1; 3576 3496 int i; 3577 3497 int j; ··· 3591 3509 for (i = 1; i < rcu_num_lvls; i++) 3592 3510 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3593 3511 rcu_init_levelspread(rsp); 3512 + rsp->flavor_mask = fl_mask; 3513 + fl_mask <<= 1; 3594 3514 3595 3515 /* Initialize the elements themselves, starting from the leaves. */ 3596 3516
+5 -1
kernel/rcu/tree.h
··· 307 307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 308 308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 309 309 unsigned long offline_fqs; /* Kicked due to being offline. */ 310 + unsigned long cond_resched_completed; 311 + /* Grace period that needs help */ 312 + /* from cond_resched(). */ 310 313 311 314 /* 5) __rcu_pending() statistics. */ 312 315 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ ··· 395 392 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 396 393 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 397 394 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 395 + u8 flavor_mask; /* bit in flavor mask. */ 398 396 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 399 397 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 400 398 void (*func)(struct rcu_head *head)); ··· 567 563 static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 568 564 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 569 565 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 570 - static void rcu_kick_nohz_cpu(int cpu); 566 + static void __maybe_unused rcu_kick_nohz_cpu(int cpu); 571 567 static bool init_nocb_callback_list(struct rcu_data *rdp); 572 568 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 573 569 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+1 -1
kernel/rcu/tree_plugin.h
··· 2404 2404 * if an adaptive-ticks CPU is failing to respond to the current grace 2405 2405 * period and has not be idle from an RCU perspective, kick it. 2406 2406 */ 2407 - static void rcu_kick_nohz_cpu(int cpu) 2407 + static void __maybe_unused rcu_kick_nohz_cpu(int cpu) 2408 2408 { 2409 2409 #ifdef CONFIG_NO_HZ_FULL 2410 2410 if (tick_nohz_full_cpu(cpu))
+2 -20
kernel/rcu/update.c
··· 200 200 EXPORT_SYMBOL_GPL(wait_rcu_gp); 201 201 202 202 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 203 - static inline void debug_init_rcu_head(struct rcu_head *head) 203 + void init_rcu_head(struct rcu_head *head) 204 204 { 205 205 debug_object_init(head, &rcuhead_debug_descr); 206 206 } 207 207 208 - static inline void debug_rcu_head_free(struct rcu_head *head) 208 + void destroy_rcu_head(struct rcu_head *head) 209 209 { 210 210 debug_object_free(head, &rcuhead_debug_descr); 211 211 } ··· 350 350 early_initcall(check_cpu_stall_init); 351 351 352 352 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 353 - 354 - /* 355 - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. 356 - */ 357 - 358 - DEFINE_PER_CPU(int, rcu_cond_resched_count); 359 - 360 - /* 361 - * Report a set of RCU quiescent states, for use by cond_resched() 362 - * and friends. Out of line due to being called infrequently. 363 - */ 364 - void rcu_resched(void) 365 - { 366 - preempt_disable(); 367 - __this_cpu_write(rcu_cond_resched_count, 0); 368 - rcu_note_context_switch(smp_processor_id()); 369 - preempt_enable(); 370 - }
+1 -6
kernel/sched/core.c
··· 4147 4147 4148 4148 int __sched _cond_resched(void) 4149 4149 { 4150 - rcu_cond_resched(); 4151 4150 if (should_resched()) { 4152 4151 __cond_resched(); 4153 4152 return 1; ··· 4165 4166 */ 4166 4167 int __cond_resched_lock(spinlock_t *lock) 4167 4168 { 4168 - bool need_rcu_resched = rcu_should_resched(); 4169 4169 int resched = should_resched(); 4170 4170 int ret = 0; 4171 4171 4172 4172 lockdep_assert_held(lock); 4173 4173 4174 - if (spin_needbreak(lock) || resched || need_rcu_resched) { 4174 + if (spin_needbreak(lock) || resched) { 4175 4175 spin_unlock(lock); 4176 4176 if (resched) 4177 4177 __cond_resched(); 4178 - else if (unlikely(need_rcu_resched)) 4179 - rcu_resched(); 4180 4178 else 4181 4179 cpu_relax(); 4182 4180 ret = 1; ··· 4187 4191 { 4188 4192 BUG_ON(!in_softirq()); 4189 4193 4190 - rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ 4191 4194 if (should_resched()) { 4192 4195 local_bh_enable(); 4193 4196 __cond_resched();