Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched-rt-2022-10-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull preempt RT updates from Thomas Gleixner:
"Introduce preempt_[dis|en]able_nested() and use it to clean up various
places which have open coded PREEMPT_RT conditionals.

On PREEMPT_RT enabled kernels, spinlocks and rwlocks disable neither
preemption nor interrupts. However, there are a few places which
depend on the implicit preemption/interrupt disable of those locks,
e.g. seqcount write sections, per CPU statistics updates etc.

PREEMPT_RT added open coded CONFIG_PREEMPT_RT conditionals to
disable/enable preemption in the related code parts all over the
place. That's hard to read and does not really explain why this is
necessary.

Linus suggested introducing helper functions (preempt_disable_nested()
and preempt_enable_nested()) and using those in the affected places. On
!RT enabled kernels these functions are NOPs, but contain a lockdep
assert to validate that preemption is actually disabled, which catches
call sites that do not have preemption disabled.

Clean up the affected code paths in mm, dentry and lib"

* tag 'sched-rt-2022-10-05' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
u64_stats: Streamline the implementation
flex_proportions: Disable preemption entering the write section.
mm/compaction: Get rid of RT ifdeffery
mm/memcontrol: Replace the PREEMPT_RT conditionals
mm/debug: Provide VM_WARN_ON_IRQS_ENABLED()
mm/vmstat: Use preempt_[dis|en]able_nested()
dentry: Use preempt_[dis|en]able_nested()
preempt: Provide preempt_[dis|en]able_nested()

+145 -135
+2 -11
fs/dcache.c
··· 2597 2597 2598 2598 static inline unsigned start_dir_add(struct inode *dir) 2599 2599 { 2600 - /* 2601 - * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT 2602 - * kernels spin_lock() implicitly disables preemption, but not on 2603 - * PREEMPT_RT. So for RT it has to be done explicitly to protect 2604 - * the sequence count write side critical section against a reader 2605 - * or another writer preempting, which would result in a live lock. 2606 - */ 2607 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 2608 - preempt_disable(); 2600 + preempt_disable_nested(); 2609 2601 for (;;) { 2610 2602 unsigned n = dir->i_dir_seq; 2611 2603 if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) ··· 2610 2618 wait_queue_head_t *d_wait) 2611 2619 { 2612 2620 smp_store_release(&dir->i_dir_seq, n + 2); 2613 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 2614 - preempt_enable(); 2621 + preempt_enable_nested(); 2615 2622 wake_up_all(d_wait); 2616 2623 } 2617 2624
+6
include/linux/mmdebug.h
··· 94 94 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) 95 95 #endif 96 96 97 + #ifdef CONFIG_DEBUG_VM_IRQSOFF 98 + #define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) 99 + #else 100 + #define VM_WARN_ON_IRQS_ENABLED() do { } while (0) 101 + #endif 102 + 97 103 #ifdef CONFIG_DEBUG_VIRTUAL 98 104 #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) 99 105 #else
+42
include/linux/preempt.h
··· 421 421 422 422 #endif /* CONFIG_SMP */ 423 423 424 + /** 425 + * preempt_disable_nested - Disable preemption inside a normally preempt disabled section 426 + * 427 + * Use for code which requires preemption protection inside a critical 428 + * section which has preemption disabled implicitly on non-PREEMPT_RT 429 + * enabled kernels, by e.g.: 430 + * - holding a spinlock/rwlock 431 + * - soft interrupt context 432 + * - regular interrupt handlers 433 + * 434 + * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft 435 + * interrupt context and regular interrupt handlers are preemptible and 436 + * only prevent migration. preempt_disable_nested() ensures that preemption 437 + * is disabled for cases which require CPU local serialization even on 438 + * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. 439 + * 440 + * The use cases are code sequences which are not serialized by a 441 + * particular lock instance, e.g.: 442 + * - seqcount write side critical sections where the seqcount is not 443 + * associated to a particular lock and therefore the automatic 444 + * protection mechanism does not work. This prevents a live lock 445 + * against a preempting high priority reader. 446 + * - RMW per CPU variable updates like vmstat. 447 + */ 448 + /* Macro to avoid header recursion hell vs. lockdep */ 449 + #define preempt_disable_nested() \ 450 + do { \ 451 + if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ 452 + preempt_disable(); \ 453 + else \ 454 + lockdep_assert_preemption_disabled(); \ 455 + } while (0) 456 + 457 + /** 458 + * preempt_enable_nested - Undo the effect of preempt_disable_nested() 459 + */ 460 + static __always_inline void preempt_enable_nested(void) 461 + { 462 + if (IS_ENABLED(CONFIG_PREEMPT_RT)) 463 + preempt_enable(); 464 + } 465 + 424 466 #endif /* __LINUX_PREEMPT_H */
+65 -82
include/linux/u64_stats_sync.h
··· 8 8 * 9 9 * Key points : 10 10 * 11 - * - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP. 11 + * - Use a seqcount on 32-bit 12 12 * - The whole thing is a no-op on 64-bit architectures. 13 13 * 14 14 * Usage constraints: ··· 20 20 * writer and also spin forever. 21 21 * 22 22 * 3) Write side must use the _irqsave() variant if other writers, or a reader, 23 - * can be invoked from an IRQ context. 23 + * can be invoked from an IRQ context. On 64bit systems this variant does not 24 + * disable interrupts. 24 25 * 25 26 * 4) If reader fetches several counters, there is no guarantee the whole values 26 27 * are consistent w.r.t. each other (remember point #2: seqcounts are not ··· 29 28 * 30 29 * 5) Readers are allowed to sleep or be preempted/interrupted: they perform 31 30 * pure reads. 32 - * 33 - * 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats 34 - * might be updated from a hardirq or softirq context (remember point #1: 35 - * seqcounts are not used for UP kernels). 32-bit UP stat readers could read 36 - * corrupted 64-bit values otherwise. 
37 31 * 38 32 * Usage : 39 33 * ··· 62 66 #include <linux/seqlock.h> 63 67 64 68 struct u64_stats_sync { 65 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 69 + #if BITS_PER_LONG == 32 66 70 seqcount_t seq; 67 71 #endif 68 72 }; ··· 94 98 local64_inc(&p->v); 95 99 } 96 100 97 - #else 101 + static inline void u64_stats_init(struct u64_stats_sync *syncp) { } 102 + static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { } 103 + static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { } 104 + static inline unsigned long __u64_stats_irqsave(void) { return 0; } 105 + static inline void __u64_stats_irqrestore(unsigned long flags) { } 106 + static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) 107 + { 108 + return 0; 109 + } 110 + static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, 111 + unsigned int start) 112 + { 113 + return false; 114 + } 115 + 116 + #else /* 64 bit */ 98 117 99 118 typedef struct { 100 119 u64 v; ··· 134 123 { 135 124 p->v++; 136 125 } 137 - #endif 138 126 139 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 140 - #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) 141 - #else 142 127 static inline void u64_stats_init(struct u64_stats_sync *syncp) 143 128 { 129 + seqcount_init(&syncp->seq); 144 130 } 145 - #endif 146 131 147 - static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) 132 + static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) 148 133 { 149 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 150 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 151 - preempt_disable(); 134 + preempt_disable_nested(); 152 135 write_seqcount_begin(&syncp->seq); 153 - #endif 154 136 } 155 137 156 - static inline void u64_stats_update_end(struct u64_stats_sync *syncp) 138 + static inline void __u64_stats_update_end(struct u64_stats_sync 
*syncp) 157 139 { 158 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 159 140 write_seqcount_end(&syncp->seq); 160 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 161 - preempt_enable(); 162 - #endif 141 + preempt_enable_nested(); 163 142 } 164 143 165 - static inline unsigned long 166 - u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) 144 + static inline unsigned long __u64_stats_irqsave(void) 167 145 { 168 - unsigned long flags = 0; 146 + unsigned long flags; 169 147 170 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 171 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 172 - preempt_disable(); 173 - else 174 - local_irq_save(flags); 175 - write_seqcount_begin(&syncp->seq); 176 - #endif 148 + local_irq_save(flags); 177 149 return flags; 178 150 } 179 151 180 - static inline void 181 - u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, 182 - unsigned long flags) 152 + static inline void __u64_stats_irqrestore(unsigned long flags) 183 153 { 184 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 185 - write_seqcount_end(&syncp->seq); 186 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 187 - preempt_enable(); 188 - else 189 - local_irq_restore(flags); 190 - #endif 154 + local_irq_restore(flags); 191 155 } 192 156 193 157 static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) 194 158 { 195 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 196 159 return read_seqcount_begin(&syncp->seq); 197 - #else 198 - return 0; 199 - #endif 160 + } 161 + 162 + static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, 163 + unsigned int start) 164 + { 165 + return read_seqcount_retry(&syncp->seq, start); 166 + } 167 + #endif /* !64 bit */ 168 + 169 + static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) 170 + { 171 + __u64_stats_update_begin(syncp); 172 + } 173 + 174 + static inline void 
u64_stats_update_end(struct u64_stats_sync *syncp) 175 + { 176 + __u64_stats_update_end(syncp); 177 + } 178 + 179 + static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) 180 + { 181 + unsigned long flags = __u64_stats_irqsave(); 182 + 183 + __u64_stats_update_begin(syncp); 184 + return flags; 185 + } 186 + 187 + static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, 188 + unsigned long flags) 189 + { 190 + __u64_stats_update_end(syncp); 191 + __u64_stats_irqrestore(flags); 200 192 } 201 193 202 194 static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) 203 195 { 204 - #if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) 205 - preempt_disable(); 206 - #endif 207 196 return __u64_stats_fetch_begin(syncp); 208 - } 209 - 210 - static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, 211 - unsigned int start) 212 - { 213 - #if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) 214 - return read_seqcount_retry(&syncp->seq, start); 215 - #else 216 - return false; 217 - #endif 218 197 } 219 198 220 199 static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, 221 200 unsigned int start) 222 201 { 223 - #if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) 224 - preempt_enable(); 225 - #endif 226 202 return __u64_stats_fetch_retry(syncp, start); 227 203 } 228 204 229 - /* 230 - * In case irq handlers can update u64 counters, readers can use following helpers 231 - * - SMP 32bit arches use seqcount protection, irq safe. 232 - * - UP 32bit must disable irqs. 233 - * - 64bit have no problem atomically reading u64 values, irq safe. 
234 - */ 205 + /* Obsolete interfaces */ 235 206 static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) 236 207 { 237 - #if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) 238 - preempt_disable(); 239 - #elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) 240 - local_irq_disable(); 241 - #endif 242 - return __u64_stats_fetch_begin(syncp); 208 + return u64_stats_fetch_begin(syncp); 243 209 } 244 210 245 211 static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, 246 212 unsigned int start) 247 213 { 248 - #if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) 249 - preempt_enable(); 250 - #elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) 251 - local_irq_enable(); 252 - #endif 253 - return __u64_stats_fetch_retry(syncp, start); 214 + return u64_stats_fetch_retry(syncp, start); 254 215 } 255 216 256 217 #endif /* _LINUX_U64_STATS_SYNC_H */
+3
lib/Kconfig.debug
··· 805 805 An architecture should select this when it can successfully 806 806 build and run DEBUG_VM_PGTABLE. 807 807 808 + config DEBUG_VM_IRQSOFF 809 + def_bool DEBUG_VM && !PREEMPT_RT 810 + 808 811 config DEBUG_VM 809 812 bool "Debug VM" 810 813 depends on DEBUG_KERNEL
+2
lib/flex_proportions.c
··· 70 70 */ 71 71 if (events <= 1) 72 72 return false; 73 + preempt_disable_nested(); 73 74 write_seqcount_begin(&p->sequence); 74 75 if (periods < 64) 75 76 events -= events >> periods; ··· 78 77 percpu_counter_add(&p->events, -events); 79 78 p->period += periods; 80 79 write_seqcount_end(&p->sequence); 80 + preempt_enable_nested(); 81 81 82 82 return true; 83 83 }
+6
mm/Kconfig
··· 579 579 it and then we would be really interested to hear about that at 580 580 linux-mm@kvack.org. 581 581 582 + config COMPACT_UNEVICTABLE_DEFAULT 583 + int 584 + depends on COMPACTION 585 + default 0 if PREEMPT_RT 586 + default 1 587 + 582 588 # 583 589 # support for free page reporting 584 590 config PAGE_REPORTING
+1 -5
mm/compaction.c
··· 1727 1727 * Allow userspace to control policy on scanning the unevictable LRU for 1728 1728 * compactable pages. 1729 1729 */ 1730 - #ifdef CONFIG_PREEMPT_RT 1731 - int sysctl_compact_unevictable_allowed __read_mostly = 0; 1732 - #else 1733 - int sysctl_compact_unevictable_allowed __read_mostly = 1; 1734 - #endif 1730 + int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT; 1735 1731 1736 1732 static inline void 1737 1733 update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+6 -13
mm/memcontrol.c
··· 597 597 */ 598 598 static void memcg_stats_lock(void) 599 599 { 600 - #ifdef CONFIG_PREEMPT_RT 601 - preempt_disable(); 602 - #else 603 - VM_BUG_ON(!irqs_disabled()); 604 - #endif 600 + preempt_disable_nested(); 601 + VM_WARN_ON_IRQS_ENABLED(); 605 602 } 606 603 607 604 static void __memcg_stats_lock(void) 608 605 { 609 - #ifdef CONFIG_PREEMPT_RT 610 - preempt_disable(); 611 - #endif 606 + preempt_disable_nested(); 612 607 } 613 608 614 609 static void memcg_stats_unlock(void) 615 610 { 616 - #ifdef CONFIG_PREEMPT_RT 617 - preempt_enable(); 618 - #endif 611 + preempt_enable_nested(); 619 612 } 620 613 621 614 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) ··· 708 715 * interrupt context while other caller need to have disabled interrupt. 709 716 */ 710 717 __memcg_stats_lock(); 711 - if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { 718 + if (IS_ENABLED(CONFIG_DEBUG_VM)) { 712 719 switch (idx) { 713 720 case NR_ANON_MAPPED: 714 721 case NR_FILE_MAPPED: ··· 718 725 WARN_ON_ONCE(!in_task()); 719 726 break; 720 727 default: 721 - WARN_ON_ONCE(!irqs_disabled()); 728 + VM_WARN_ON_IRQS_ENABLED(); 722 729 } 723 730 } 724 731
+12 -24
mm/vmstat.c
··· 355 355 * CPU migrations and preemption potentially corrupts a counter so 356 356 * disable preemption. 357 357 */ 358 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 359 - preempt_disable(); 358 + preempt_disable_nested(); 360 359 361 360 x = delta + __this_cpu_read(*p); 362 361 ··· 367 368 } 368 369 __this_cpu_write(*p, x); 369 370 370 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 371 - preempt_enable(); 371 + preempt_enable_nested(); 372 372 } 373 373 EXPORT_SYMBOL(__mod_zone_page_state); 374 374 ··· 391 393 } 392 394 393 395 /* See __mod_node_page_state */ 394 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 395 - preempt_disable(); 396 + preempt_disable_nested(); 396 397 397 398 x = delta + __this_cpu_read(*p); 398 399 ··· 403 406 } 404 407 __this_cpu_write(*p, x); 405 408 406 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 407 - preempt_enable(); 409 + preempt_enable_nested(); 408 410 } 409 411 EXPORT_SYMBOL(__mod_node_page_state); 410 412 ··· 437 441 s8 v, t; 438 442 439 443 /* See __mod_node_page_state */ 440 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 441 - preempt_disable(); 444 + preempt_disable_nested(); 442 445 443 446 v = __this_cpu_inc_return(*p); 444 447 t = __this_cpu_read(pcp->stat_threshold); ··· 448 453 __this_cpu_write(*p, -overstep); 449 454 } 450 455 451 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 452 - preempt_enable(); 456 + preempt_enable_nested(); 453 457 } 454 458 455 459 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) ··· 460 466 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); 461 467 462 468 /* See __mod_node_page_state */ 463 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 464 - preempt_disable(); 469 + preempt_disable_nested(); 465 470 466 471 v = __this_cpu_inc_return(*p); 467 472 t = __this_cpu_read(pcp->stat_threshold); ··· 471 478 __this_cpu_write(*p, -overstep); 472 479 } 473 480 474 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 475 - preempt_enable(); 481 + preempt_enable_nested(); 476 482 } 477 483 478 484 void __inc_zone_page_state(struct page *page, enum zone_stat_item 
item) ··· 493 501 s8 v, t; 494 502 495 503 /* See __mod_node_page_state */ 496 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 497 - preempt_disable(); 504 + preempt_disable_nested(); 498 505 499 506 v = __this_cpu_dec_return(*p); 500 507 t = __this_cpu_read(pcp->stat_threshold); ··· 504 513 __this_cpu_write(*p, overstep); 505 514 } 506 515 507 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 508 - preempt_enable(); 516 + preempt_enable_nested(); 509 517 } 510 518 511 519 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) ··· 516 526 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); 517 527 518 528 /* See __mod_node_page_state */ 519 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 520 - preempt_disable(); 529 + preempt_disable_nested(); 521 530 522 531 v = __this_cpu_dec_return(*p); 523 532 t = __this_cpu_read(pcp->stat_threshold); ··· 527 538 __this_cpu_write(*p, overstep); 528 539 } 529 540 530 - if (IS_ENABLED(CONFIG_PREEMPT_RT)) 531 - preempt_enable(); 541 + preempt_enable_nested(); 532 542 } 533 543 534 544 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)