Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

futex: Create hb scopes

Create explicit scopes for the hb (futex hash bucket) variables by wrapping their users in `if (1) { ... }` blocks; this is almost a pure re-indent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250416162921.513656-6-bigeasy@linutronix.de

+504 -485
+42 -39
kernel/futex/core.c
··· 944 944 { 945 945 struct list_head *next, *head = &curr->pi_state_list; 946 946 struct futex_pi_state *pi_state; 947 - struct futex_hash_bucket *hb; 948 947 union futex_key key = FUTEX_KEY_INIT; 949 948 950 949 /* ··· 956 957 next = head->next; 957 958 pi_state = list_entry(next, struct futex_pi_state, list); 958 959 key = pi_state->key; 959 - hb = futex_hash(&key); 960 + if (1) { 961 + struct futex_hash_bucket *hb; 960 962 961 - /* 962 - * We can race against put_pi_state() removing itself from the 963 - * list (a waiter going away). put_pi_state() will first 964 - * decrement the reference count and then modify the list, so 965 - * its possible to see the list entry but fail this reference 966 - * acquire. 967 - * 968 - * In that case; drop the locks to let put_pi_state() make 969 - * progress and retry the loop. 970 - */ 971 - if (!refcount_inc_not_zero(&pi_state->refcount)) { 963 + hb = futex_hash(&key); 964 + 965 + /* 966 + * We can race against put_pi_state() removing itself from the 967 + * list (a waiter going away). put_pi_state() will first 968 + * decrement the reference count and then modify the list, so 969 + * its possible to see the list entry but fail this reference 970 + * acquire. 971 + * 972 + * In that case; drop the locks to let put_pi_state() make 973 + * progress and retry the loop. 
974 + */ 975 + if (!refcount_inc_not_zero(&pi_state->refcount)) { 976 + raw_spin_unlock_irq(&curr->pi_lock); 977 + cpu_relax(); 978 + raw_spin_lock_irq(&curr->pi_lock); 979 + continue; 980 + } 972 981 raw_spin_unlock_irq(&curr->pi_lock); 973 - cpu_relax(); 974 - raw_spin_lock_irq(&curr->pi_lock); 975 - continue; 976 - } 977 - raw_spin_unlock_irq(&curr->pi_lock); 978 982 979 - spin_lock(&hb->lock); 980 - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 981 - raw_spin_lock(&curr->pi_lock); 982 - /* 983 - * We dropped the pi-lock, so re-check whether this 984 - * task still owns the PI-state: 985 - */ 986 - if (head->next != next) { 987 - /* retain curr->pi_lock for the loop invariant */ 988 - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 983 + spin_lock(&hb->lock); 984 + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 985 + raw_spin_lock(&curr->pi_lock); 986 + /* 987 + * We dropped the pi-lock, so re-check whether this 988 + * task still owns the PI-state: 989 + */ 990 + if (head->next != next) { 991 + /* retain curr->pi_lock for the loop invariant */ 992 + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 993 + spin_unlock(&hb->lock); 994 + put_pi_state(pi_state); 995 + continue; 996 + } 997 + 998 + WARN_ON(pi_state->owner != curr); 999 + WARN_ON(list_empty(&pi_state->list)); 1000 + list_del_init(&pi_state->list); 1001 + pi_state->owner = NULL; 1002 + 1003 + raw_spin_unlock(&curr->pi_lock); 1004 + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 989 1005 spin_unlock(&hb->lock); 990 - put_pi_state(pi_state); 991 - continue; 992 1006 } 993 - 994 - WARN_ON(pi_state->owner != curr); 995 - WARN_ON(list_empty(&pi_state->list)); 996 - list_del_init(&pi_state->list); 997 - pi_state->owner = NULL; 998 - 999 - raw_spin_unlock(&curr->pi_lock); 1000 - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1001 - spin_unlock(&hb->lock); 1002 1007 1003 1008 rt_mutex_futex_unlock(&pi_state->pi_mutex); 1004 1009 put_pi_state(pi_state);
+143 -139
kernel/futex/pi.c
··· 920 920 struct hrtimer_sleeper timeout, *to; 921 921 struct task_struct *exiting = NULL; 922 922 struct rt_mutex_waiter rt_waiter; 923 - struct futex_hash_bucket *hb; 924 923 struct futex_q q = futex_q_init; 925 924 DEFINE_WAKE_Q(wake_q); 926 925 int res, ret; ··· 938 939 goto out; 939 940 940 941 retry_private: 941 - hb = futex_hash(&q.key); 942 - futex_q_lock(&q, hb); 942 + if (1) { 943 + struct futex_hash_bucket *hb; 943 944 944 - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 945 - &exiting, 0); 946 - if (unlikely(ret)) { 947 - /* 948 - * Atomic work succeeded and we got the lock, 949 - * or failed. Either way, we do _not_ block. 950 - */ 951 - switch (ret) { 952 - case 1: 953 - /* We got the lock. */ 954 - ret = 0; 955 - goto out_unlock_put_key; 956 - case -EFAULT: 957 - goto uaddr_faulted; 958 - case -EBUSY: 959 - case -EAGAIN: 945 + hb = futex_hash(&q.key); 946 + futex_q_lock(&q, hb); 947 + 948 + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 949 + &exiting, 0); 950 + if (unlikely(ret)) { 960 951 /* 961 - * Two reasons for this: 962 - * - EBUSY: Task is exiting and we just wait for the 963 - * exit to complete. 964 - * - EAGAIN: The user space value changed. 952 + * Atomic work succeeded and we got the lock, 953 + * or failed. Either way, we do _not_ block. 965 954 */ 966 - futex_q_unlock(hb); 967 - /* 968 - * Handle the case where the owner is in the middle of 969 - * exiting. Wait for the exit to complete otherwise 970 - * this task might loop forever, aka. live lock. 971 - */ 972 - wait_for_owner_exiting(ret, exiting); 973 - cond_resched(); 974 - goto retry; 975 - default: 976 - goto out_unlock_put_key; 955 + switch (ret) { 956 + case 1: 957 + /* We got the lock. 
*/ 958 + ret = 0; 959 + goto out_unlock_put_key; 960 + case -EFAULT: 961 + goto uaddr_faulted; 962 + case -EBUSY: 963 + case -EAGAIN: 964 + /* 965 + * Two reasons for this: 966 + * - EBUSY: Task is exiting and we just wait for the 967 + * exit to complete. 968 + * - EAGAIN: The user space value changed. 969 + */ 970 + futex_q_unlock(hb); 971 + /* 972 + * Handle the case where the owner is in the middle of 973 + * exiting. Wait for the exit to complete otherwise 974 + * this task might loop forever, aka. live lock. 975 + */ 976 + wait_for_owner_exiting(ret, exiting); 977 + cond_resched(); 978 + goto retry; 979 + default: 980 + goto out_unlock_put_key; 981 + } 977 982 } 978 - } 979 983 980 - WARN_ON(!q.pi_state); 984 + WARN_ON(!q.pi_state); 981 985 982 - /* 983 - * Only actually queue now that the atomic ops are done: 984 - */ 985 - __futex_queue(&q, hb, current); 986 + /* 987 + * Only actually queue now that the atomic ops are done: 988 + */ 989 + __futex_queue(&q, hb, current); 986 990 987 - if (trylock) { 988 - ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 989 - /* Fixup the trylock return value: */ 990 - ret = ret ? 0 : -EWOULDBLOCK; 991 - goto no_block; 992 - } 991 + if (trylock) { 992 + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 993 + /* Fixup the trylock return value: */ 994 + ret = ret ? 0 : -EWOULDBLOCK; 995 + goto no_block; 996 + } 993 997 994 - /* 995 - * Must be done before we enqueue the waiter, here is unfortunately 996 - * under the hb lock, but that *should* work because it does nothing. 997 - */ 998 - rt_mutex_pre_schedule(); 998 + /* 999 + * Must be done before we enqueue the waiter, here is unfortunately 1000 + * under the hb lock, but that *should* work because it does nothing. 
1001 + */ 1002 + rt_mutex_pre_schedule(); 999 1003 1000 - rt_mutex_init_waiter(&rt_waiter); 1004 + rt_mutex_init_waiter(&rt_waiter); 1001 1005 1002 - /* 1003 - * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not 1004 - * hold it while doing rt_mutex_start_proxy(), because then it will 1005 - * include hb->lock in the blocking chain, even through we'll not in 1006 - * fact hold it while blocking. This will lead it to report -EDEADLK 1007 - * and BUG when futex_unlock_pi() interleaves with this. 1008 - * 1009 - * Therefore acquire wait_lock while holding hb->lock, but drop the 1010 - * latter before calling __rt_mutex_start_proxy_lock(). This 1011 - * interleaves with futex_unlock_pi() -- which does a similar lock 1012 - * handoff -- such that the latter can observe the futex_q::pi_state 1013 - * before __rt_mutex_start_proxy_lock() is done. 1014 - */ 1015 - raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 1016 - spin_unlock(q.lock_ptr); 1017 - /* 1018 - * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter 1019 - * such that futex_unlock_pi() is guaranteed to observe the waiter when 1020 - * it sees the futex_q::pi_state. 1021 - */ 1022 - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); 1023 - raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); 1006 + /* 1007 + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not 1008 + * hold it while doing rt_mutex_start_proxy(), because then it will 1009 + * include hb->lock in the blocking chain, even through we'll not in 1010 + * fact hold it while blocking. This will lead it to report -EDEADLK 1011 + * and BUG when futex_unlock_pi() interleaves with this. 1012 + * 1013 + * Therefore acquire wait_lock while holding hb->lock, but drop the 1014 + * latter before calling __rt_mutex_start_proxy_lock(). 
This 1015 + * interleaves with futex_unlock_pi() -- which does a similar lock 1016 + * handoff -- such that the latter can observe the futex_q::pi_state 1017 + * before __rt_mutex_start_proxy_lock() is done. 1018 + */ 1019 + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 1020 + spin_unlock(q.lock_ptr); 1021 + /* 1022 + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter 1023 + * such that futex_unlock_pi() is guaranteed to observe the waiter when 1024 + * it sees the futex_q::pi_state. 1025 + */ 1026 + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); 1027 + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); 1024 1028 1025 - if (ret) { 1026 - if (ret == 1) 1027 - ret = 0; 1028 - goto cleanup; 1029 - } 1029 + if (ret) { 1030 + if (ret == 1) 1031 + ret = 0; 1032 + goto cleanup; 1033 + } 1030 1034 1031 - if (unlikely(to)) 1032 - hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 1035 + if (unlikely(to)) 1036 + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 1033 1037 1034 - ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 1038 + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 1035 1039 1036 1040 cleanup: 1037 - /* 1038 - * If we failed to acquire the lock (deadlock/signal/timeout), we must 1039 - * must unwind the above, however we canont lock hb->lock because 1040 - * rt_mutex already has a waiter enqueued and hb->lock can itself try 1041 - * and enqueue an rt_waiter through rtlock. 1042 - * 1043 - * Doing the cleanup without holding hb->lock can cause inconsistent 1044 - * state between hb and pi_state, but only in the direction of not 1045 - * seeing a waiter that is leaving. 1046 - * 1047 - * See futex_unlock_pi(), it deals with this inconsistency. 
1048 - * 1049 - * There be dragons here, since we must deal with the inconsistency on 1050 - * the way out (here), it is impossible to detect/warn about the race 1051 - * the other way around (missing an incoming waiter). 1052 - * 1053 - * What could possibly go wrong... 1054 - */ 1055 - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 1056 - ret = 0; 1041 + /* 1042 + * If we failed to acquire the lock (deadlock/signal/timeout), we must 1043 + * unwind the above, however we canont lock hb->lock because 1044 + * rt_mutex already has a waiter enqueued and hb->lock can itself try 1045 + * and enqueue an rt_waiter through rtlock. 1046 + * 1047 + * Doing the cleanup without holding hb->lock can cause inconsistent 1048 + * state between hb and pi_state, but only in the direction of not 1049 + * seeing a waiter that is leaving. 1050 + * 1051 + * See futex_unlock_pi(), it deals with this inconsistency. 1052 + * 1053 + * There be dragons here, since we must deal with the inconsistency on 1054 + * the way out (here), it is impossible to detect/warn about the race 1055 + * the other way around (missing an incoming waiter). 1056 + * 1057 + * What could possibly go wrong... 1058 + */ 1059 + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 1060 + ret = 0; 1057 1061 1058 - /* 1059 - * Now that the rt_waiter has been dequeued, it is safe to use 1060 - * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up 1061 - * the 1062 - */ 1063 - spin_lock(q.lock_ptr); 1064 - /* 1065 - * Waiter is unqueued. 1066 - */ 1067 - rt_mutex_post_schedule(); 1062 + /* 1063 + * Now that the rt_waiter has been dequeued, it is safe to use 1064 + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up 1065 + * the 1066 + */ 1067 + spin_lock(q.lock_ptr); 1068 + /* 1069 + * Waiter is unqueued. 
1070 + */ 1071 + rt_mutex_post_schedule(); 1068 1072 no_block: 1069 - /* 1070 - * Fixup the pi_state owner and possibly acquire the lock if we 1071 - * haven't already. 1072 - */ 1073 - res = fixup_pi_owner(uaddr, &q, !ret); 1074 - /* 1075 - * If fixup_pi_owner() returned an error, propagate that. If it acquired 1076 - * the lock, clear our -ETIMEDOUT or -EINTR. 1077 - */ 1078 - if (res) 1079 - ret = (res < 0) ? res : 0; 1073 + /* 1074 + * Fixup the pi_state owner and possibly acquire the lock if we 1075 + * haven't already. 1076 + */ 1077 + res = fixup_pi_owner(uaddr, &q, !ret); 1078 + /* 1079 + * If fixup_pi_owner() returned an error, propagate that. If it acquired 1080 + * the lock, clear our -ETIMEDOUT or -EINTR. 1081 + */ 1082 + if (res) 1083 + ret = (res < 0) ? res : 0; 1080 1084 1081 - futex_unqueue_pi(&q); 1082 - spin_unlock(q.lock_ptr); 1083 - goto out; 1085 + futex_unqueue_pi(&q); 1086 + spin_unlock(q.lock_ptr); 1087 + goto out; 1084 1088 1085 1089 out_unlock_put_key: 1086 - futex_q_unlock(hb); 1090 + futex_q_unlock(hb); 1091 + goto out; 1092 + 1093 + uaddr_faulted: 1094 + futex_q_unlock(hb); 1095 + 1096 + ret = fault_in_user_writeable(uaddr); 1097 + if (ret) 1098 + goto out; 1099 + 1100 + if (!(flags & FLAGS_SHARED)) 1101 + goto retry_private; 1102 + 1103 + goto retry; 1104 + } 1087 1105 1088 1106 out: 1089 1107 if (to) { ··· 1108 1092 destroy_hrtimer_on_stack(&to->timer); 1109 1093 } 1110 1094 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1111 - 1112 - uaddr_faulted: 1113 - futex_q_unlock(hb); 1114 - 1115 - ret = fault_in_user_writeable(uaddr); 1116 - if (ret) 1117 - goto out; 1118 - 1119 - if (!(flags & FLAGS_SHARED)) 1120 - goto retry_private; 1121 - 1122 - goto retry; 1123 1095 } 1124 1096 1125 1097 /*
+218 -215
kernel/futex/requeue.c
··· 371 371 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 372 372 int task_count = 0, ret; 373 373 struct futex_pi_state *pi_state = NULL; 374 - struct futex_hash_bucket *hb1, *hb2; 375 374 struct futex_q *this, *next; 376 375 DEFINE_WAKE_Q(wake_q); 377 376 ··· 442 443 if (requeue_pi && futex_match(&key1, &key2)) 443 444 return -EINVAL; 444 445 445 - hb1 = futex_hash(&key1); 446 - hb2 = futex_hash(&key2); 447 - 448 446 retry_private: 449 - futex_hb_waiters_inc(hb2); 450 - double_lock_hb(hb1, hb2); 447 + if (1) { 448 + struct futex_hash_bucket *hb1, *hb2; 451 449 452 - if (likely(cmpval != NULL)) { 453 - u32 curval; 450 + hb1 = futex_hash(&key1); 451 + hb2 = futex_hash(&key2); 454 452 455 - ret = futex_get_value_locked(&curval, uaddr1); 453 + futex_hb_waiters_inc(hb2); 454 + double_lock_hb(hb1, hb2); 456 455 457 - if (unlikely(ret)) { 458 - double_unlock_hb(hb1, hb2); 459 - futex_hb_waiters_dec(hb2); 456 + if (likely(cmpval != NULL)) { 457 + u32 curval; 460 458 461 - ret = get_user(curval, uaddr1); 462 - if (ret) 463 - return ret; 459 + ret = futex_get_value_locked(&curval, uaddr1); 464 460 465 - if (!(flags1 & FLAGS_SHARED)) 466 - goto retry_private; 461 + if (unlikely(ret)) { 462 + double_unlock_hb(hb1, hb2); 463 + futex_hb_waiters_dec(hb2); 467 464 468 - goto retry; 469 - } 470 - if (curval != *cmpval) { 471 - ret = -EAGAIN; 472 - goto out_unlock; 473 - } 474 - } 465 + ret = get_user(curval, uaddr1); 466 + if (ret) 467 + return ret; 475 468 476 - if (requeue_pi) { 477 - struct task_struct *exiting = NULL; 469 + if (!(flags1 & FLAGS_SHARED)) 470 + goto retry_private; 478 471 479 - /* 480 - * Attempt to acquire uaddr2 and wake the top waiter. If we 481 - * intend to requeue waiters, force setting the FUTEX_WAITERS 482 - * bit. We force this here where we are able to easily handle 483 - * faults rather in the requeue loop below. 484 - * 485 - * Updates topwaiter::requeue_state if a top waiter exists. 
486 - */ 487 - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 488 - &key2, &pi_state, 489 - &exiting, nr_requeue); 490 - 491 - /* 492 - * At this point the top_waiter has either taken uaddr2 or 493 - * is waiting on it. In both cases pi_state has been 494 - * established and an initial refcount on it. In case of an 495 - * error there's nothing. 496 - * 497 - * The top waiter's requeue_state is up to date: 498 - * 499 - * - If the lock was acquired atomically (ret == 1), then 500 - * the state is Q_REQUEUE_PI_LOCKED. 501 - * 502 - * The top waiter has been dequeued and woken up and can 503 - * return to user space immediately. The kernel/user 504 - * space state is consistent. In case that there must be 505 - * more waiters requeued the WAITERS bit in the user 506 - * space futex is set so the top waiter task has to go 507 - * into the syscall slowpath to unlock the futex. This 508 - * will block until this requeue operation has been 509 - * completed and the hash bucket locks have been 510 - * dropped. 511 - * 512 - * - If the trylock failed with an error (ret < 0) then 513 - * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing 514 - * happened", or Q_REQUEUE_PI_IGNORE when there was an 515 - * interleaved early wakeup. 516 - * 517 - * - If the trylock did not succeed (ret == 0) then the 518 - * state is either Q_REQUEUE_PI_IN_PROGRESS or 519 - * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. 520 - * This will be cleaned up in the loop below, which 521 - * cannot fail because futex_proxy_trylock_atomic() did 522 - * the same sanity checks for requeue_pi as the loop 523 - * below does. 524 - */ 525 - switch (ret) { 526 - case 0: 527 - /* We hold a reference on the pi state. */ 528 - break; 529 - 530 - case 1: 531 - /* 532 - * futex_proxy_trylock_atomic() acquired the user space 533 - * futex. Adjust task_count. 
534 - */ 535 - task_count++; 536 - ret = 0; 537 - break; 538 - 539 - /* 540 - * If the above failed, then pi_state is NULL and 541 - * waiter::requeue_state is correct. 542 - */ 543 - case -EFAULT: 544 - double_unlock_hb(hb1, hb2); 545 - futex_hb_waiters_dec(hb2); 546 - ret = fault_in_user_writeable(uaddr2); 547 - if (!ret) 548 472 goto retry; 549 - return ret; 550 - case -EBUSY: 551 - case -EAGAIN: 552 - /* 553 - * Two reasons for this: 554 - * - EBUSY: Owner is exiting and we just wait for the 555 - * exit to complete. 556 - * - EAGAIN: The user space value changed. 557 - */ 558 - double_unlock_hb(hb1, hb2); 559 - futex_hb_waiters_dec(hb2); 560 - /* 561 - * Handle the case where the owner is in the middle of 562 - * exiting. Wait for the exit to complete otherwise 563 - * this task might loop forever, aka. live lock. 564 - */ 565 - wait_for_owner_exiting(ret, exiting); 566 - cond_resched(); 567 - goto retry; 568 - default: 569 - goto out_unlock; 570 - } 571 - } 572 - 573 - plist_for_each_entry_safe(this, next, &hb1->chain, list) { 574 - if (task_count - nr_wake >= nr_requeue) 575 - break; 576 - 577 - if (!futex_match(&this->key, &key1)) 578 - continue; 579 - 580 - /* 581 - * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always 582 - * be paired with each other and no other futex ops. 583 - * 584 - * We should never be requeueing a futex_q with a pi_state, 585 - * which is awaiting a futex_unlock_pi(). 
586 - */ 587 - if ((requeue_pi && !this->rt_waiter) || 588 - (!requeue_pi && this->rt_waiter) || 589 - this->pi_state) { 590 - ret = -EINVAL; 591 - break; 473 + } 474 + if (curval != *cmpval) { 475 + ret = -EAGAIN; 476 + goto out_unlock; 477 + } 592 478 } 593 479 594 - /* Plain futexes just wake or requeue and are done */ 595 - if (!requeue_pi) { 596 - if (++task_count <= nr_wake) 597 - this->wake(&wake_q, this); 598 - else 480 + if (requeue_pi) { 481 + struct task_struct *exiting = NULL; 482 + 483 + /* 484 + * Attempt to acquire uaddr2 and wake the top waiter. If we 485 + * intend to requeue waiters, force setting the FUTEX_WAITERS 486 + * bit. We force this here where we are able to easily handle 487 + * faults rather in the requeue loop below. 488 + * 489 + * Updates topwaiter::requeue_state if a top waiter exists. 490 + */ 491 + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 492 + &key2, &pi_state, 493 + &exiting, nr_requeue); 494 + 495 + /* 496 + * At this point the top_waiter has either taken uaddr2 or 497 + * is waiting on it. In both cases pi_state has been 498 + * established and an initial refcount on it. In case of an 499 + * error there's nothing. 500 + * 501 + * The top waiter's requeue_state is up to date: 502 + * 503 + * - If the lock was acquired atomically (ret == 1), then 504 + * the state is Q_REQUEUE_PI_LOCKED. 505 + * 506 + * The top waiter has been dequeued and woken up and can 507 + * return to user space immediately. The kernel/user 508 + * space state is consistent. In case that there must be 509 + * more waiters requeued the WAITERS bit in the user 510 + * space futex is set so the top waiter task has to go 511 + * into the syscall slowpath to unlock the futex. This 512 + * will block until this requeue operation has been 513 + * completed and the hash bucket locks have been 514 + * dropped. 515 + * 516 + * - If the trylock failed with an error (ret < 0) then 517 + * the state is either Q_REQUEUE_PI_NONE, i.e. 
"nothing 518 + * happened", or Q_REQUEUE_PI_IGNORE when there was an 519 + * interleaved early wakeup. 520 + * 521 + * - If the trylock did not succeed (ret == 0) then the 522 + * state is either Q_REQUEUE_PI_IN_PROGRESS or 523 + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. 524 + * This will be cleaned up in the loop below, which 525 + * cannot fail because futex_proxy_trylock_atomic() did 526 + * the same sanity checks for requeue_pi as the loop 527 + * below does. 528 + */ 529 + switch (ret) { 530 + case 0: 531 + /* We hold a reference on the pi state. */ 532 + break; 533 + 534 + case 1: 535 + /* 536 + * futex_proxy_trylock_atomic() acquired the user space 537 + * futex. Adjust task_count. 538 + */ 539 + task_count++; 540 + ret = 0; 541 + break; 542 + 543 + /* 544 + * If the above failed, then pi_state is NULL and 545 + * waiter::requeue_state is correct. 546 + */ 547 + case -EFAULT: 548 + double_unlock_hb(hb1, hb2); 549 + futex_hb_waiters_dec(hb2); 550 + ret = fault_in_user_writeable(uaddr2); 551 + if (!ret) 552 + goto retry; 553 + return ret; 554 + case -EBUSY: 555 + case -EAGAIN: 556 + /* 557 + * Two reasons for this: 558 + * - EBUSY: Owner is exiting and we just wait for the 559 + * exit to complete. 560 + * - EAGAIN: The user space value changed. 561 + */ 562 + double_unlock_hb(hb1, hb2); 563 + futex_hb_waiters_dec(hb2); 564 + /* 565 + * Handle the case where the owner is in the middle of 566 + * exiting. Wait for the exit to complete otherwise 567 + * this task might loop forever, aka. live lock. 
568 + */ 569 + wait_for_owner_exiting(ret, exiting); 570 + cond_resched(); 571 + goto retry; 572 + default: 573 + goto out_unlock; 574 + } 575 + } 576 + 577 + plist_for_each_entry_safe(this, next, &hb1->chain, list) { 578 + if (task_count - nr_wake >= nr_requeue) 579 + break; 580 + 581 + if (!futex_match(&this->key, &key1)) 582 + continue; 583 + 584 + /* 585 + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always 586 + * be paired with each other and no other futex ops. 587 + * 588 + * We should never be requeueing a futex_q with a pi_state, 589 + * which is awaiting a futex_unlock_pi(). 590 + */ 591 + if ((requeue_pi && !this->rt_waiter) || 592 + (!requeue_pi && this->rt_waiter) || 593 + this->pi_state) { 594 + ret = -EINVAL; 595 + break; 596 + } 597 + 598 + /* Plain futexes just wake or requeue and are done */ 599 + if (!requeue_pi) { 600 + if (++task_count <= nr_wake) 601 + this->wake(&wake_q, this); 602 + else 603 + requeue_futex(this, hb1, hb2, &key2); 604 + continue; 605 + } 606 + 607 + /* Ensure we requeue to the expected futex for requeue_pi. */ 608 + if (!futex_match(this->requeue_pi_key, &key2)) { 609 + ret = -EINVAL; 610 + break; 611 + } 612 + 613 + /* 614 + * Requeue nr_requeue waiters and possibly one more in the case 615 + * of requeue_pi if we couldn't acquire the lock atomically. 616 + * 617 + * Prepare the waiter to take the rt_mutex. Take a refcount 618 + * on the pi_state and store the pointer in the futex_q 619 + * object of the waiter. 620 + */ 621 + get_pi_state(pi_state); 622 + 623 + /* Don't requeue when the waiter is already on the way out. */ 624 + if (!futex_requeue_pi_prepare(this, pi_state)) { 625 + /* 626 + * Early woken waiter signaled that it is on the 627 + * way out. Drop the pi_state reference and try the 628 + * next waiter. @this->pi_state is still NULL. 
629 + */ 630 + put_pi_state(pi_state); 631 + continue; 632 + } 633 + 634 + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 635 + this->rt_waiter, 636 + this->task); 637 + 638 + if (ret == 1) { 639 + /* 640 + * We got the lock. We do neither drop the refcount 641 + * on pi_state nor clear this->pi_state because the 642 + * waiter needs the pi_state for cleaning up the 643 + * user space value. It will drop the refcount 644 + * after doing so. this::requeue_state is updated 645 + * in the wakeup as well. 646 + */ 647 + requeue_pi_wake_futex(this, &key2, hb2); 648 + task_count++; 649 + } else if (!ret) { 650 + /* Waiter is queued, move it to hb2 */ 599 651 requeue_futex(this, hb1, hb2, &key2); 600 - continue; 601 - } 602 - 603 - /* Ensure we requeue to the expected futex for requeue_pi. */ 604 - if (!futex_match(this->requeue_pi_key, &key2)) { 605 - ret = -EINVAL; 606 - break; 652 + futex_requeue_pi_complete(this, 0); 653 + task_count++; 654 + } else { 655 + /* 656 + * rt_mutex_start_proxy_lock() detected a potential 657 + * deadlock when we tried to queue that waiter. 658 + * Drop the pi_state reference which we took above 659 + * and remove the pointer to the state from the 660 + * waiters futex_q object. 661 + */ 662 + this->pi_state = NULL; 663 + put_pi_state(pi_state); 664 + futex_requeue_pi_complete(this, ret); 665 + /* 666 + * We stop queueing more waiters and let user space 667 + * deal with the mess. 668 + */ 669 + break; 670 + } 607 671 } 608 672 609 673 /* 610 - * Requeue nr_requeue waiters and possibly one more in the case 611 - * of requeue_pi if we couldn't acquire the lock atomically. 612 - * 613 - * Prepare the waiter to take the rt_mutex. Take a refcount 614 - * on the pi_state and store the pointer in the futex_q 615 - * object of the waiter. 674 + * We took an extra initial reference to the pi_state in 675 + * futex_proxy_trylock_atomic(). We need to drop it here again. 
616 676 */ 617 - get_pi_state(pi_state); 618 - 619 - /* Don't requeue when the waiter is already on the way out. */ 620 - if (!futex_requeue_pi_prepare(this, pi_state)) { 621 - /* 622 - * Early woken waiter signaled that it is on the 623 - * way out. Drop the pi_state reference and try the 624 - * next waiter. @this->pi_state is still NULL. 625 - */ 626 - put_pi_state(pi_state); 627 - continue; 628 - } 629 - 630 - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 631 - this->rt_waiter, 632 - this->task); 633 - 634 - if (ret == 1) { 635 - /* 636 - * We got the lock. We do neither drop the refcount 637 - * on pi_state nor clear this->pi_state because the 638 - * waiter needs the pi_state for cleaning up the 639 - * user space value. It will drop the refcount 640 - * after doing so. this::requeue_state is updated 641 - * in the wakeup as well. 642 - */ 643 - requeue_pi_wake_futex(this, &key2, hb2); 644 - task_count++; 645 - } else if (!ret) { 646 - /* Waiter is queued, move it to hb2 */ 647 - requeue_futex(this, hb1, hb2, &key2); 648 - futex_requeue_pi_complete(this, 0); 649 - task_count++; 650 - } else { 651 - /* 652 - * rt_mutex_start_proxy_lock() detected a potential 653 - * deadlock when we tried to queue that waiter. 654 - * Drop the pi_state reference which we took above 655 - * and remove the pointer to the state from the 656 - * waiters futex_q object. 657 - */ 658 - this->pi_state = NULL; 659 - put_pi_state(pi_state); 660 - futex_requeue_pi_complete(this, ret); 661 - /* 662 - * We stop queueing more waiters and let user space 663 - * deal with the mess. 664 - */ 665 - break; 666 - } 667 - } 668 - 669 - /* 670 - * We took an extra initial reference to the pi_state in 671 - * futex_proxy_trylock_atomic(). We need to drop it here again. 
672 - */ 673 - put_pi_state(pi_state); 677 + put_pi_state(pi_state); 674 678 675 679 out_unlock: 676 - double_unlock_hb(hb1, hb2); 680 + double_unlock_hb(hb1, hb2); 681 + futex_hb_waiters_dec(hb2); 682 + } 677 683 wake_up_q(&wake_q); 678 - futex_hb_waiters_dec(hb2); 679 684 return ret ? ret : task_count; 680 685 } 681 686
+101 -92
kernel/futex/waitwake.c
··· 253 253 int nr_wake, int nr_wake2, int op) 254 254 { 255 255 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 256 - struct futex_hash_bucket *hb1, *hb2; 257 256 struct futex_q *this, *next; 258 257 int ret, op_ret; 259 258 DEFINE_WAKE_Q(wake_q); ··· 265 266 if (unlikely(ret != 0)) 266 267 return ret; 267 268 268 - hb1 = futex_hash(&key1); 269 - hb2 = futex_hash(&key2); 270 - 271 269 retry_private: 272 - double_lock_hb(hb1, hb2); 273 - op_ret = futex_atomic_op_inuser(op, uaddr2); 274 - if (unlikely(op_ret < 0)) { 275 - double_unlock_hb(hb1, hb2); 270 + if (1) { 271 + struct futex_hash_bucket *hb1, *hb2; 276 272 277 - if (!IS_ENABLED(CONFIG_MMU) || 278 - unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { 279 - /* 280 - * we don't get EFAULT from MMU faults if we don't have 281 - * an MMU, but we might get them from range checking 282 - */ 283 - ret = op_ret; 284 - return ret; 285 - } 273 + hb1 = futex_hash(&key1); 274 + hb2 = futex_hash(&key2); 286 275 287 - if (op_ret == -EFAULT) { 288 - ret = fault_in_user_writeable(uaddr2); 289 - if (ret) 276 + double_lock_hb(hb1, hb2); 277 + op_ret = futex_atomic_op_inuser(op, uaddr2); 278 + if (unlikely(op_ret < 0)) { 279 + double_unlock_hb(hb1, hb2); 280 + 281 + if (!IS_ENABLED(CONFIG_MMU) || 282 + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { 283 + /* 284 + * we don't get EFAULT from MMU faults if we don't have 285 + * an MMU, but we might get them from range checking 286 + */ 287 + ret = op_ret; 290 288 return ret; 291 - } 292 - 293 - cond_resched(); 294 - if (!(flags & FLAGS_SHARED)) 295 - goto retry_private; 296 - goto retry; 297 - } 298 - 299 - plist_for_each_entry_safe(this, next, &hb1->chain, list) { 300 - if (futex_match (&this->key, &key1)) { 301 - if (this->pi_state || this->rt_waiter) { 302 - ret = -EINVAL; 303 - goto out_unlock; 304 289 } 305 - this->wake(&wake_q, this); 306 - if (++ret >= nr_wake) 307 - break; 308 - } 309 - } 310 290 311 - if (op_ret > 0) { 312 - op_ret = 0; 313 - 
plist_for_each_entry_safe(this, next, &hb2->chain, list) { 314 - if (futex_match (&this->key, &key2)) { 291 + if (op_ret == -EFAULT) { 292 + ret = fault_in_user_writeable(uaddr2); 293 + if (ret) 294 + return ret; 295 + } 296 + 297 + cond_resched(); 298 + if (!(flags & FLAGS_SHARED)) 299 + goto retry_private; 300 + goto retry; 301 + } 302 + 303 + plist_for_each_entry_safe(this, next, &hb1->chain, list) { 304 + if (futex_match(&this->key, &key1)) { 315 305 if (this->pi_state || this->rt_waiter) { 316 306 ret = -EINVAL; 317 307 goto out_unlock; 318 308 } 319 309 this->wake(&wake_q, this); 320 - if (++op_ret >= nr_wake2) 310 + if (++ret >= nr_wake) 321 311 break; 322 312 } 323 313 } 324 - ret += op_ret; 325 - } 314 + 315 + if (op_ret > 0) { 316 + op_ret = 0; 317 + plist_for_each_entry_safe(this, next, &hb2->chain, list) { 318 + if (futex_match(&this->key, &key2)) { 319 + if (this->pi_state || this->rt_waiter) { 320 + ret = -EINVAL; 321 + goto out_unlock; 322 + } 323 + this->wake(&wake_q, this); 324 + if (++op_ret >= nr_wake2) 325 + break; 326 + } 327 + } 328 + ret += op_ret; 329 + } 326 330 327 331 out_unlock: 328 - double_unlock_hb(hb1, hb2); 332 + double_unlock_hb(hb1, hb2); 333 + } 329 334 wake_up_q(&wake_q); 330 335 return ret; 331 336 } ··· 405 402 */ 406 403 int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) 407 404 { 408 - struct futex_hash_bucket *hb; 409 405 bool retry = false; 410 406 int ret, i; 411 407 u32 uval; ··· 443 441 struct futex_q *q = &vs[i].q; 444 442 u32 val = vs[i].w.val; 445 443 446 - hb = futex_hash(&q->key); 447 - futex_q_lock(q, hb); 448 - ret = futex_get_value_locked(&uval, uaddr); 444 + if (1) { 445 + struct futex_hash_bucket *hb; 449 446 450 - if (!ret && uval == val) { 451 - /* 452 - * The bucket lock can't be held while dealing with the 453 - * next futex. Queue each futex at this moment so hb can 454 - * be unlocked. 
455 - */ 456 - futex_queue(q, hb, current); 457 - continue; 447 + hb = futex_hash(&q->key); 448 + futex_q_lock(q, hb); 449 + ret = futex_get_value_locked(&uval, uaddr); 450 + 451 + if (!ret && uval == val) { 452 + /* 453 + * The bucket lock can't be held while dealing with the 454 + * next futex. Queue each futex at this moment so hb can 455 + * be unlocked. 456 + */ 457 + futex_queue(q, hb, current); 458 + continue; 459 + } 460 + 461 + futex_q_unlock(hb); 458 462 } 459 - 460 - futex_q_unlock(hb); 461 463 __set_current_state(TASK_RUNNING); 462 464 463 465 /* ··· 590 584 struct futex_q *q, union futex_key *key2, 591 585 struct task_struct *task) 592 586 { 593 - struct futex_hash_bucket *hb; 594 587 u32 uval; 595 588 int ret; 596 589 ··· 617 612 return ret; 618 613 619 614 retry_private: 620 - hb = futex_hash(&q->key); 621 - futex_q_lock(q, hb); 615 + if (1) { 616 + struct futex_hash_bucket *hb; 622 617 623 - ret = futex_get_value_locked(&uval, uaddr); 618 + hb = futex_hash(&q->key); 619 + futex_q_lock(q, hb); 624 620 625 - if (ret) { 626 - futex_q_unlock(hb); 621 + ret = futex_get_value_locked(&uval, uaddr); 627 622 628 - ret = get_user(uval, uaddr); 629 - if (ret) 630 - return ret; 623 + if (ret) { 624 + futex_q_unlock(hb); 631 625 632 - if (!(flags & FLAGS_SHARED)) 633 - goto retry_private; 626 + ret = get_user(uval, uaddr); 627 + if (ret) 628 + return ret; 634 629 635 - goto retry; 630 + if (!(flags & FLAGS_SHARED)) 631 + goto retry_private; 632 + 633 + goto retry; 634 + } 635 + 636 + if (uval != val) { 637 + futex_q_unlock(hb); 638 + return -EWOULDBLOCK; 639 + } 640 + 641 + if (key2 && futex_match(&q->key, key2)) { 642 + futex_q_unlock(hb); 643 + return -EINVAL; 644 + } 645 + 646 + /* 647 + * The task state is guaranteed to be set before another task can 648 + * wake it. 
set_current_state() is implemented using smp_store_mb() and 649 + * futex_queue() calls spin_unlock() upon completion, both serializing 650 + * access to the hash list and forcing another memory barrier. 651 + */ 652 + if (task == current) 653 + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 654 + futex_queue(q, hb, task); 636 655 } 637 - 638 - if (uval != val) { 639 - futex_q_unlock(hb); 640 - return -EWOULDBLOCK; 641 - } 642 - 643 - if (key2 && futex_match(&q->key, key2)) { 644 - futex_q_unlock(hb); 645 - return -EINVAL; 646 - } 647 - 648 - /* 649 - * The task state is guaranteed to be set before another task can 650 - * wake it. set_current_state() is implemented using smp_store_mb() and 651 - * futex_queue() calls spin_unlock() upon completion, both serializing 652 - * access to the hash list and forcing another memory barrier. 653 - */ 654 - if (task == current) 655 - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 656 - futex_queue(q, hb, task); 657 656 658 657 return ret; 659 658 }