BKL: revert back to the old spinlock implementation

The generic semaphore rewrite had a huge performance regression on AIM7
(and potentially other BKL-heavy benchmarks) because the generic
semaphores had been rewritten to be simple to understand and fair. The
latter, in particular, turns a semaphore-based BKL implementation into a
mess of scheduling.

The attempt to fix the performance regression failed miserably (see the
previous commit 00b41ec2611dc98f87f30753ee00a53db648d662 'Revert
"semaphore: fix"'), and so for now the simple and sane approach is to
instead just go back to the old spinlock-based BKL implementation that
never had any issues like this.

This patch also has the advantage of being reported to fix the
regression completely according to Yanmin Zhang, unlike the semaphore
hack which still left a couple of percentage points of regression.

As a spinlock, the BKL obviously has the potential to be a latency
issue, but it's not really any different from any other spinlock in that
respect. We do want to get rid of the BKL asap, but that has been the
plan for several years.

These days, the biggest users are in the tty layer (open/release in
particular) and Alan holds out some hope:

"tty release is probably a few months away from getting cured - I'm
afraid it will almost certainly be the very last user of the BKL in
tty to get fixed as it depends on everything else being sanely locked."

so while we're not there yet, we do have a plan of action.

Tested-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Alexander Viro <viro@ftp.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Linus Torvalds 18 years ago 8e3e076c 00b41ec2

+98 -84

4 changed files

expand all

arch

mn10300

Kconfig

include

linux

hardirq.h

kernel

sched.c

lib

kernel_lock.c

-11

arch/mn10300/Kconfig

··· 186 186 Say Y here if you are building a kernel for a desktop, embedded 187 187 or real-time system. Say N if you are unsure. 188 188 189 - config PREEMPT_BKL 190 - bool "Preempt The Big Kernel Lock" 191 - depends on PREEMPT 192 - default y 193 - help 194 - This option reduces the latency of the kernel by making the 195 - big kernel lock preemptible. 196 - 197 - Say Y here if you are building a kernel for a desktop system. 198 - Say N if you are unsure. 199 - 200 189 config MN10300_CURRENT_IN_E2 201 190 bool "Hold current task address in E2 register" 202 191 default y

+10 -8

include/linux/hardirq.h

··· 72 72 #define in_softirq() (softirq_count()) 73 73 #define in_interrupt() (irq_count()) 74 74 75 + #if defined(CONFIG_PREEMPT) 76 + # define PREEMPT_INATOMIC_BASE kernel_locked() 77 + # define PREEMPT_CHECK_OFFSET 1 78 + #else 79 + # define PREEMPT_INATOMIC_BASE 0 80 + # define PREEMPT_CHECK_OFFSET 0 81 + #endif 82 + 75 83 /* 76 84 * Are we running in atomic context? WARNING: this macro cannot 77 85 * always detect atomic context; in particular, it cannot know about ··· 87 79 * used in the general case to determine whether sleeping is possible. 88 80 * Do not use in_atomic() in driver code. 89 81 */ 90 - #define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) 91 - 92 - #ifdef CONFIG_PREEMPT 93 - # define PREEMPT_CHECK_OFFSET 1 94 - #else 95 - # define PREEMPT_CHECK_OFFSET 0 96 - #endif 82 + #define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE) 97 83 98 84 /* 99 85 * Check whether we were atomic before we did preempt_disable(): 100 - * (used by the scheduler) 86 + * (used by the scheduler, *after* releasing the kernel lock) 101 87 */ 102 88 #define in_atomic_preempt_off() \ 103 89 ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)

+4 -23

kernel/sched.c

··· 4567 4567 asmlinkage void __sched preempt_schedule(void) 4568 4568 { 4569 4569 struct thread_info *ti = current_thread_info(); 4570 - struct task_struct *task = current; 4571 - int saved_lock_depth; 4572 4570 4573 4571 /* 4574 4572 * If there is a non-zero preempt_count or interrupts are disabled, ··· 4577 4579 4578 4580 do { 4579 4581 add_preempt_count(PREEMPT_ACTIVE); 4580 - 4581 - /* 4582 - * We keep the big kernel semaphore locked, but we 4583 - * clear ->lock_depth so that schedule() doesnt 4584 - * auto-release the semaphore: 4585 - */ 4586 - saved_lock_depth = task->lock_depth; 4587 - task->lock_depth = -1; 4588 4582 schedule(); 4589 - task->lock_depth = saved_lock_depth; 4590 4583 sub_preempt_count(PREEMPT_ACTIVE); 4591 4584 4592 4585 /* ··· 4598 4609 asmlinkage void __sched preempt_schedule_irq(void) 4599 4610 { 4600 4611 struct thread_info *ti = current_thread_info(); 4601 - struct task_struct *task = current; 4602 - int saved_lock_depth; 4603 4612 4604 4613 /* Catch callers which need to be fixed */ 4605 4614 BUG_ON(ti->preempt_count || !irqs_disabled()); 4606 4615 4607 4616 do { 4608 4617 add_preempt_count(PREEMPT_ACTIVE); 4609 - 4610 - /* 4611 - * We keep the big kernel semaphore locked, but we 4612 - * clear ->lock_depth so that schedule() doesnt 4613 - * auto-release the semaphore: 4614 - */ 4615 - saved_lock_depth = task->lock_depth; 4616 - task->lock_depth = -1; 4617 4618 local_irq_enable(); 4618 4619 schedule(); 4619 4620 local_irq_disable(); 4620 - task->lock_depth = saved_lock_depth; 4621 4621 sub_preempt_count(PREEMPT_ACTIVE); 4622 4622 4623 4623 /* ··· 5831 5853 spin_unlock_irqrestore(&rq->lock, flags); 5832 5854 5833 5855 /* Set the preempt count _outside_ the spinlocks! 
*/ 5856 + #if defined(CONFIG_PREEMPT) 5857 + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5858 + #else 5834 5859 task_thread_info(idle)->preempt_count = 0; 5835 - 5860 + #endif 5836 5861 /* 5837 5862 * The idle tasks have their own, simple scheduling class: 5838 5863 */

+84 -42

lib/kernel_lock.c

··· 11 11 #include <linux/semaphore.h> 12 12 13 13 /* 14 - * The 'big kernel semaphore' 14 + * The 'big kernel lock' 15 15 * 16 - * This mutex is taken and released recursively by lock_kernel() 16 + * This spinlock is taken and released recursively by lock_kernel() 17 17 * and unlock_kernel(). It is transparently dropped and reacquired 18 18 * over schedule(). It is used to protect legacy code that hasn't 19 19 * been migrated to a proper locking design yet. 20 20 * 21 - * Note: code locked by this semaphore will only be serialized against 22 - * other code using the same locking facility. The code guarantees that 23 - * the task remains on the same CPU. 24 - * 25 21 * Don't use in new code. 26 22 */ 27 - static DECLARE_MUTEX(kernel_sem); 23 + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag); 24 + 28 25 29 26 /* 30 - * Re-acquire the kernel semaphore. 27 + * Acquire/release the underlying lock from the scheduler. 31 28 * 32 - * This function is called with preemption off. 29 + * This is called with preemption disabled, and should 30 + * return an error value if it cannot get the lock and 31 + * TIF_NEED_RESCHED gets set. 33 32 * 34 - * We are executing in schedule() so the code must be extremely careful 35 - * about recursion, both due to the down() and due to the enabling of 36 - * preemption. schedule() will re-check the preemption flag after 37 - * reacquiring the semaphore. 33 + * If it successfully gets the lock, it should increment 34 + * the preemption count like any spinlock does. 
35 + * 36 + * (This works on UP too - _raw_spin_trylock will never 37 + * return false in that case) 38 38 */ 39 39 int __lockfunc __reacquire_kernel_lock(void) 40 40 { 41 - struct task_struct *task = current; 42 - int saved_lock_depth = task->lock_depth; 43 - 44 - BUG_ON(saved_lock_depth < 0); 45 - 46 - task->lock_depth = -1; 47 - preempt_enable_no_resched(); 48 - 49 - down(&kernel_sem); 50 - 41 + while (!_raw_spin_trylock(&kernel_flag)) { 42 + if (test_thread_flag(TIF_NEED_RESCHED)) 43 + return -EAGAIN; 44 + cpu_relax(); 45 + } 51 46 preempt_disable(); 52 - task->lock_depth = saved_lock_depth; 53 - 54 47 return 0; 55 48 } 56 49 57 50 void __lockfunc __release_kernel_lock(void) 58 51 { 59 - up(&kernel_sem); 52 + _raw_spin_unlock(&kernel_flag); 53 + preempt_enable_no_resched(); 60 54 } 61 55 62 56 /* 63 - * Getting the big kernel semaphore. 57 + * These are the BKL spinlocks - we try to be polite about preemption. 58 + * If SMP is not on (ie UP preemption), this all goes away because the 59 + * _raw_spin_trylock() will always succeed. 60 + */ 61 + #ifdef CONFIG_PREEMPT 62 + static inline void __lock_kernel(void) 63 + { 64 + preempt_disable(); 65 + if (unlikely(!_raw_spin_trylock(&kernel_flag))) { 66 + /* 67 + * If preemption was disabled even before this 68 + * was called, there's nothing we can be polite 69 + * about - just spin. 70 + */ 71 + if (preempt_count() > 1) { 72 + _raw_spin_lock(&kernel_flag); 73 + return; 74 + } 75 + 76 + /* 77 + * Otherwise, let's wait for the kernel lock 78 + * with preemption enabled.. 
79 + */ 80 + do { 81 + preempt_enable(); 82 + while (spin_is_locked(&kernel_flag)) 83 + cpu_relax(); 84 + preempt_disable(); 85 + } while (!_raw_spin_trylock(&kernel_flag)); 86 + } 87 + } 88 + 89 + #else 90 + 91 + /* 92 + * Non-preemption case - just get the spinlock 93 + */ 94 + static inline void __lock_kernel(void) 95 + { 96 + _raw_spin_lock(&kernel_flag); 97 + } 98 + #endif 99 + 100 + static inline void __unlock_kernel(void) 101 + { 102 + /* 103 + * the BKL is not covered by lockdep, so we open-code the 104 + * unlocking sequence (and thus avoid the dep-chain ops): 105 + */ 106 + _raw_spin_unlock(&kernel_flag); 107 + preempt_enable(); 108 + } 109 + 110 + /* 111 + * Getting the big kernel lock. 112 + * 113 + * This cannot happen asynchronously, so we only need to 114 + * worry about other CPU's. 64 115 */ 65 116 void __lockfunc lock_kernel(void) 66 117 { 67 - struct task_struct *task = current; 68 - int depth = task->lock_depth + 1; 69 - 118 + int depth = current->lock_depth+1; 70 119 if (likely(!depth)) 71 - /* 72 - * No recursion worries - we set up lock_depth _after_ 73 - */ 74 - down(&kernel_sem); 75 - 76 - task->lock_depth = depth; 120 + __lock_kernel(); 121 + current->lock_depth = depth; 77 122 } 78 123 79 124 void __lockfunc unlock_kernel(void) 80 125 { 81 - struct task_struct *task = current; 82 - 83 - BUG_ON(task->lock_depth < 0); 84 - 85 - if (likely(--task->lock_depth < 0)) 86 - up(&kernel_sem); 126 + BUG_ON(current->lock_depth < 0); 127 + if (likely(--current->lock_depth < 0)) 128 + __unlock_kernel(); 87 129 } 88 130 89 131 EXPORT_SYMBOL(lock_kernel);

Configure Feed

Configure Feed