Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'locking_urgent_for_v6.5_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fix from Borislav Petkov:

- Fix an rtmutex race condition caused by sharing a single sort key
between the lock's waiters tree and the PI chain tree (->pi_waiters)
of a task, by giving each tree its own copy of the sort key

* tag 'locking_urgent_for_v6.5_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
locking/rtmutex: Fix task->pi_waiters integrity

+156 -77
+115 -57
kernel/locking/rtmutex.c
··· 333 333 return prio; 334 334 } 335 335 336 + /* 337 + * Update the waiter->tree copy of the sort keys. 338 + */ 336 339 static __always_inline void 337 340 waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) 338 341 { 339 - waiter->prio = __waiter_prio(task); 340 - waiter->deadline = task->dl.deadline; 342 + lockdep_assert_held(&waiter->lock->wait_lock); 343 + lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); 344 + 345 + waiter->tree.prio = __waiter_prio(task); 346 + waiter->tree.deadline = task->dl.deadline; 341 347 } 342 348 343 349 /* 344 - * Only use with rt_mutex_waiter_{less,equal}() 350 + * Update the waiter->pi_tree copy of the sort keys (from the tree copy). 345 351 */ 346 - #define task_to_waiter(p) \ 347 - &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } 352 + static __always_inline void 353 + waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) 354 + { 355 + lockdep_assert_held(&waiter->lock->wait_lock); 356 + lockdep_assert_held(&task->pi_lock); 357 + lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); 348 358 349 - static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, 350 - struct rt_mutex_waiter *right) 359 + waiter->pi_tree.prio = waiter->tree.prio; 360 + waiter->pi_tree.deadline = waiter->tree.deadline; 361 + } 362 + 363 + /* 364 + * Only use with rt_waiter_node_{less,equal}() 365 + */ 366 + #define task_to_waiter_node(p) \ 367 + &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } 368 + #define task_to_waiter(p) \ 369 + &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } 370 + 371 + static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, 372 + struct rt_waiter_node *right) 351 373 { 352 374 if (left->prio < right->prio) 353 375 return 1; ··· 386 364 return 0; 387 365 } 388 366 389 - static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, 390 - struct 
rt_mutex_waiter *right) 367 + static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, 368 + struct rt_waiter_node *right) 391 369 { 392 370 if (left->prio != right->prio) 393 371 return 0; ··· 407 385 static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, 408 386 struct rt_mutex_waiter *top_waiter) 409 387 { 410 - if (rt_mutex_waiter_less(waiter, top_waiter)) 388 + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) 411 389 return true; 412 390 413 391 #ifdef RT_MUTEX_BUILD_SPINLOCKS ··· 415 393 * Note that RT tasks are excluded from same priority (lateral) 416 394 * steals to prevent the introduction of an unbounded latency. 417 395 */ 418 - if (rt_prio(waiter->prio) || dl_prio(waiter->prio)) 396 + if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) 419 397 return false; 420 398 421 - return rt_mutex_waiter_equal(waiter, top_waiter); 399 + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); 422 400 #else 423 401 return false; 424 402 #endif 425 403 } 426 404 427 405 #define __node_2_waiter(node) \ 428 - rb_entry((node), struct rt_mutex_waiter, tree_entry) 406 + rb_entry((node), struct rt_mutex_waiter, tree.entry) 429 407 430 408 static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) 431 409 { 432 410 struct rt_mutex_waiter *aw = __node_2_waiter(a); 433 411 struct rt_mutex_waiter *bw = __node_2_waiter(b); 434 412 435 - if (rt_mutex_waiter_less(aw, bw)) 413 + if (rt_waiter_node_less(&aw->tree, &bw->tree)) 436 414 return 1; 437 415 438 416 if (!build_ww_mutex()) 439 417 return 0; 440 418 441 - if (rt_mutex_waiter_less(bw, aw)) 419 + if (rt_waiter_node_less(&bw->tree, &aw->tree)) 442 420 return 0; 443 421 444 422 /* NOTE: relies on waiter->ww_ctx being set before insertion */ ··· 456 434 static __always_inline void 457 435 rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) 458 436 { 459 - rb_add_cached(&waiter->tree_entry, &lock->waiters, 
__waiter_less); 437 + lockdep_assert_held(&lock->wait_lock); 438 + 439 + rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); 460 440 } 461 441 462 442 static __always_inline void 463 443 rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) 464 444 { 465 - if (RB_EMPTY_NODE(&waiter->tree_entry)) 445 + lockdep_assert_held(&lock->wait_lock); 446 + 447 + if (RB_EMPTY_NODE(&waiter->tree.entry)) 466 448 return; 467 449 468 - rb_erase_cached(&waiter->tree_entry, &lock->waiters); 469 - RB_CLEAR_NODE(&waiter->tree_entry); 450 + rb_erase_cached(&waiter->tree.entry, &lock->waiters); 451 + RB_CLEAR_NODE(&waiter->tree.entry); 470 452 } 471 453 472 - #define __node_2_pi_waiter(node) \ 473 - rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) 454 + #define __node_2_rt_node(node) \ 455 + rb_entry((node), struct rt_waiter_node, entry) 474 456 475 - static __always_inline bool 476 - __pi_waiter_less(struct rb_node *a, const struct rb_node *b) 457 + static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) 477 458 { 478 - return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); 459 + return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); 479 460 } 480 461 481 462 static __always_inline void 482 463 rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) 483 464 { 484 - rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); 465 + lockdep_assert_held(&task->pi_lock); 466 + 467 + rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); 485 468 } 486 469 487 470 static __always_inline void 488 471 rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) 489 472 { 490 - if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) 473 + lockdep_assert_held(&task->pi_lock); 474 + 475 + if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) 491 476 return; 492 477 493 - rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); 494 - 
RB_CLEAR_NODE(&waiter->pi_tree_entry); 478 + rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); 479 + RB_CLEAR_NODE(&waiter->pi_tree.entry); 495 480 } 496 481 497 - static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) 482 + static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, 483 + struct task_struct *p) 498 484 { 499 485 struct task_struct *pi_task = NULL; 500 486 487 + lockdep_assert_held(&lock->wait_lock); 488 + lockdep_assert(rt_mutex_owner(lock) == p); 501 489 lockdep_assert_held(&p->pi_lock); 502 490 503 491 if (task_has_pi_waiters(p)) ··· 603 571 * Chain walk basics and protection scope 604 572 * 605 573 * [R] refcount on task 606 - * [P] task->pi_lock held 574 + * [Pn] task->pi_lock held 607 575 * [L] rtmutex->wait_lock held 576 + * 577 + * Normal locking order: 578 + * 579 + * rtmutex->wait_lock 580 + * task->pi_lock 608 581 * 609 582 * Step Description Protected by 610 583 * function arguments: ··· 625 588 * again: 626 589 * loop_sanity_check(); 627 590 * retry: 628 - * [1] lock(task->pi_lock); [R] acquire [P] 629 - * [2] waiter = task->pi_blocked_on; [P] 630 - * [3] check_exit_conditions_1(); [P] 631 - * [4] lock = waiter->lock; [P] 632 - * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] 633 - * unlock(task->pi_lock); release [P] 591 + * [1] lock(task->pi_lock); [R] acquire [P1] 592 + * [2] waiter = task->pi_blocked_on; [P1] 593 + * [3] check_exit_conditions_1(); [P1] 594 + * [4] lock = waiter->lock; [P1] 595 + * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] 596 + * unlock(task->pi_lock); release [P1] 634 597 * goto retry; 635 598 * } 636 - * [6] check_exit_conditions_2(); [P] + [L] 637 - * [7] requeue_lock_waiter(lock, waiter); [P] + [L] 638 - * [8] unlock(task->pi_lock); release [P] 599 + * [6] check_exit_conditions_2(); [P1] + [L] 600 + * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] 601 + * [8] unlock(task->pi_lock); release [P1] 639 602 * put_task_struct(task); 
release [R] 640 603 * [9] check_exit_conditions_3(); [L] 641 604 * [10] task = owner(lock); [L] 642 605 * get_task_struct(task); [L] acquire [R] 643 - * lock(task->pi_lock); [L] acquire [P] 644 - * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] 645 - * [12] check_exit_conditions_4(); [P] + [L] 646 - * [13] unlock(task->pi_lock); release [P] 606 + * lock(task->pi_lock); [L] acquire [P2] 607 + * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] 608 + * [12] check_exit_conditions_4(); [P2] + [L] 609 + * [13] unlock(task->pi_lock); release [P2] 647 610 * unlock(lock->wait_lock); release [L] 648 611 * goto again; 612 + * 613 + * Where P1 is the blocking task and P2 is the lock owner; going up one step 614 + * the owner becomes the next blocked task etc.. 615 + * 616 + * 649 617 */ 650 618 static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, 651 619 enum rtmutex_chainwalk chwalk, ··· 798 756 * enabled we continue, but stop the requeueing in the chain 799 757 * walk. 800 758 */ 801 - if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { 759 + if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { 802 760 if (!detect_deadlock) 803 761 goto out_unlock_pi; 804 762 else ··· 806 764 } 807 765 808 766 /* 809 - * [4] Get the next lock 767 + * [4] Get the next lock; per holding task->pi_lock we can't unblock 768 + * and guarantee @lock's existence. 810 769 */ 811 770 lock = waiter->lock; 812 771 /* 813 772 * [5] We need to trylock here as we are holding task->pi_lock, 814 773 * which is the reverse lock order versus the other rtmutex 815 774 * operations. 775 + * 776 + * Per the above, holding task->pi_lock guarantees lock exists, so 777 + * inverting this lock order is infeasible from a life-time 778 + * perspective. 816 779 */ 817 780 if (!raw_spin_trylock(&lock->wait_lock)) { 818 781 raw_spin_unlock_irq(&task->pi_lock); ··· 921 874 * or 922 875 * 923 876 * DL CBS enforcement advancing the effective deadline. 
924 - * 925 - * Even though pi_waiters also uses these fields, and that tree is only 926 - * updated in [11], we can do this here, since we hold [L], which 927 - * serializes all pi_waiters access and rb_erase() does not care about 928 - * the values of the node being removed. 929 877 */ 930 878 waiter_update_prio(waiter, task); 931 879 932 880 rt_mutex_enqueue(lock, waiter); 933 881 934 - /* [8] Release the task */ 882 + /* 883 + * [8] Release the (blocking) task in preparation for 884 + * taking the owner task in [10]. 885 + * 886 + * Since we hold lock->waiter_lock, task cannot unblock, even if we 887 + * release task->pi_lock. 888 + */ 935 889 raw_spin_unlock(&task->pi_lock); 936 890 put_task_struct(task); 937 891 ··· 956 908 return 0; 957 909 } 958 910 959 - /* [10] Grab the next task, i.e. the owner of @lock */ 911 + /* 912 + * [10] Grab the next task, i.e. the owner of @lock 913 + * 914 + * Per holding lock->wait_lock and checking for !owner above, there 915 + * must be an owner and it cannot go away. 916 + */ 960 917 task = get_task_struct(rt_mutex_owner(lock)); 961 918 raw_spin_lock(&task->pi_lock); 962 919 ··· 974 921 * and adjust the priority of the owner. 975 922 */ 976 923 rt_mutex_dequeue_pi(task, prerequeue_top_waiter); 924 + waiter_clone_prio(waiter, task); 977 925 rt_mutex_enqueue_pi(task, waiter); 978 - rt_mutex_adjust_prio(task); 926 + rt_mutex_adjust_prio(lock, task); 979 927 980 928 } else if (prerequeue_top_waiter == waiter) { 981 929 /* ··· 991 937 */ 992 938 rt_mutex_dequeue_pi(task, waiter); 993 939 waiter = rt_mutex_top_waiter(lock); 940 + waiter_clone_prio(waiter, task); 994 941 rt_mutex_enqueue_pi(task, waiter); 995 - rt_mutex_adjust_prio(task); 942 + rt_mutex_adjust_prio(lock, task); 996 943 } else { 997 944 /* 998 945 * Nothing changed. 
No need to do any priority ··· 1209 1154 waiter->task = task; 1210 1155 waiter->lock = lock; 1211 1156 waiter_update_prio(waiter, task); 1157 + waiter_clone_prio(waiter, task); 1212 1158 1213 1159 /* Get the top priority waiter on the lock */ 1214 1160 if (rt_mutex_has_waiters(lock)) ··· 1243 1187 rt_mutex_dequeue_pi(owner, top_waiter); 1244 1188 rt_mutex_enqueue_pi(owner, waiter); 1245 1189 1246 - rt_mutex_adjust_prio(owner); 1190 + rt_mutex_adjust_prio(lock, owner); 1247 1191 if (owner->pi_blocked_on) 1248 1192 chain_walk = 1; 1249 1193 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { ··· 1290 1234 { 1291 1235 struct rt_mutex_waiter *waiter; 1292 1236 1237 + lockdep_assert_held(&lock->wait_lock); 1238 + 1293 1239 raw_spin_lock(&current->pi_lock); 1294 1240 1295 1241 waiter = rt_mutex_top_waiter(lock); ··· 1304 1246 * task unblocks. 1305 1247 */ 1306 1248 rt_mutex_dequeue_pi(current, waiter); 1307 - rt_mutex_adjust_prio(current); 1249 + rt_mutex_adjust_prio(lock, current); 1308 1250 1309 1251 /* 1310 1252 * As we are waking up the top waiter, and the waiter stays ··· 1540 1482 if (rt_mutex_has_waiters(lock)) 1541 1483 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); 1542 1484 1543 - rt_mutex_adjust_prio(owner); 1485 + rt_mutex_adjust_prio(lock, owner); 1544 1486 1545 1487 /* Store the lock on which owner is blocked or NULL */ 1546 1488 next_lock = task_blocked_on_lock(owner);
+1 -1
kernel/locking/rtmutex_api.c
··· 459 459 raw_spin_lock_irqsave(&task->pi_lock, flags); 460 460 461 461 waiter = task->pi_blocked_on; 462 - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { 462 + if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { 463 463 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 464 464 return; 465 465 }
+34 -13
kernel/locking/rtmutex_common.h
··· 17 17 #include <linux/rtmutex.h> 18 18 #include <linux/sched/wake_q.h> 19 19 20 + 21 + /* 22 + * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two 23 + * separate trees and they need their own copy of the sort keys because of 24 + * different locking requirements. 25 + * 26 + * @entry: rbtree node to enqueue into the waiters tree 27 + * @prio: Priority of the waiter 28 + * @deadline: Deadline of the waiter if applicable 29 + * 30 + * See rt_waiter_node_less() and waiter_*_prio(). 31 + */ 32 + struct rt_waiter_node { 33 + struct rb_node entry; 34 + int prio; 35 + u64 deadline; 36 + }; 37 + 20 38 /* 21 39 * This is the control structure for tasks blocked on a rt_mutex, 22 40 * which is allocated on the kernel stack on of the blocked task. 23 41 * 24 - * @tree_entry: pi node to enqueue into the mutex waiters tree 25 - * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree 42 + * @tree: node to enqueue into the mutex waiters tree 43 + * @pi_tree: node to enqueue into the mutex owner waiters tree 26 44 * @task: task reference to the blocked task 27 45 * @lock: Pointer to the rt_mutex on which the waiter blocks 28 46 * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) 29 - * @prio: Priority of the waiter 30 - * @deadline: Deadline of the waiter if applicable 31 47 * @ww_ctx: WW context pointer 48 + * 49 + * @tree is ordered by @lock->wait_lock 50 + * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock 32 51 */ 33 52 struct rt_mutex_waiter { 34 - struct rb_node tree_entry; 35 - struct rb_node pi_tree_entry; 53 + struct rt_waiter_node tree; 54 + struct rt_waiter_node pi_tree; 36 55 struct task_struct *task; 37 56 struct rt_mutex_base *lock; 38 57 unsigned int wake_state; 39 - int prio; 40 - u64 deadline; 41 58 struct ww_acquire_ctx *ww_ctx; 42 59 }; 43 60 ··· 122 105 { 123 106 struct rb_node *leftmost = rb_first_cached(&lock->waiters); 124 107 125 - return rb_entry(leftmost, struct rt_mutex_waiter, 
tree_entry) == waiter; 108 + return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter; 126 109 } 127 110 128 111 static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) ··· 130 113 struct rb_node *leftmost = rb_first_cached(&lock->waiters); 131 114 struct rt_mutex_waiter *w = NULL; 132 115 116 + lockdep_assert_held(&lock->wait_lock); 117 + 133 118 if (leftmost) { 134 - w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); 119 + w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry); 135 120 BUG_ON(w->lock != lock); 136 121 } 137 122 return w; ··· 146 127 147 128 static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) 148 129 { 130 + lockdep_assert_held(&p->pi_lock); 131 + 149 132 return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, 150 - pi_tree_entry); 133 + pi_tree.entry); 151 134 } 152 135 153 136 #define RT_MUTEX_HAS_WAITERS 1UL ··· 211 190 static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 212 191 { 213 192 debug_rt_mutex_init_waiter(waiter); 214 - RB_CLEAR_NODE(&waiter->pi_tree_entry); 215 - RB_CLEAR_NODE(&waiter->tree_entry); 193 + RB_CLEAR_NODE(&waiter->pi_tree.entry); 194 + RB_CLEAR_NODE(&waiter->tree.entry); 216 195 waiter->wake_state = TASK_NORMAL; 217 196 waiter->task = NULL; 218 197 }
+6 -6
kernel/locking/ww_mutex.h
··· 96 96 struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); 97 97 if (!n) 98 98 return NULL; 99 - return rb_entry(n, struct rt_mutex_waiter, tree_entry); 99 + return rb_entry(n, struct rt_mutex_waiter, tree.entry); 100 100 } 101 101 102 102 static inline struct rt_mutex_waiter * 103 103 __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) 104 104 { 105 - struct rb_node *n = rb_next(&w->tree_entry); 105 + struct rb_node *n = rb_next(&w->tree.entry); 106 106 if (!n) 107 107 return NULL; 108 - return rb_entry(n, struct rt_mutex_waiter, tree_entry); 108 + return rb_entry(n, struct rt_mutex_waiter, tree.entry); 109 109 } 110 110 111 111 static inline struct rt_mutex_waiter * 112 112 __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) 113 113 { 114 - struct rb_node *n = rb_prev(&w->tree_entry); 114 + struct rb_node *n = rb_prev(&w->tree.entry); 115 115 if (!n) 116 116 return NULL; 117 - return rb_entry(n, struct rt_mutex_waiter, tree_entry); 117 + return rb_entry(n, struct rt_mutex_waiter, tree.entry); 118 118 } 119 119 120 120 static inline struct rt_mutex_waiter * ··· 123 123 struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); 124 124 if (!n) 125 125 return NULL; 126 - return rb_entry(n, struct rt_mutex_waiter, tree_entry); 126 + return rb_entry(n, struct rt_mutex_waiter, tree.entry); 127 127 } 128 128 129 129 static inline void