Merge tag 'locking-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+34 -2

include/linux/futex.h

··· 4 4 5 5 #include <linux/sched.h> 6 6 #include <linux/ktime.h> 7 + #include <linux/mm_types.h> 7 8 8 9 #include <uapi/linux/futex.h> 9 10 10 11 struct inode; 11 - struct mm_struct; 12 12 struct task_struct; 13 13 14 14 /* ··· 34 34 u64 i_seq; 35 35 unsigned long pgoff; 36 36 unsigned int offset; 37 + /* unsigned int node; */ 37 38 } shared; 38 39 struct { 39 40 union { ··· 43 42 }; 44 43 unsigned long address; 45 44 unsigned int offset; 45 + /* unsigned int node; */ 46 46 } private; 47 47 struct { 48 48 u64 ptr; 49 49 unsigned long word; 50 50 unsigned int offset; 51 + unsigned int node; /* NOT hashed! */ 51 52 } both; 52 53 }; 53 54 ··· 80 77 81 78 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 82 79 u32 __user *uaddr2, u32 val2, u32 val3); 83 - #else 80 + int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4); 81 + 82 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 83 + int futex_hash_allocate_default(void); 84 + void futex_hash_free(struct mm_struct *mm); 85 + 86 + static inline void futex_mm_init(struct mm_struct *mm) 87 + { 88 + RCU_INIT_POINTER(mm->futex_phash, NULL); 89 + mutex_init(&mm->futex_hash_lock); 90 + } 91 + 92 + #else /* !CONFIG_FUTEX_PRIVATE_HASH */ 93 + static inline int futex_hash_allocate_default(void) { return 0; } 94 + static inline void futex_hash_free(struct mm_struct *mm) { } 95 + static inline void futex_mm_init(struct mm_struct *mm) { } 96 + #endif /* CONFIG_FUTEX_PRIVATE_HASH */ 97 + 98 + #else /* !CONFIG_FUTEX */ 84 99 static inline void futex_init_task(struct task_struct *tsk) { } 85 100 static inline void futex_exit_recursive(struct task_struct *tsk) { } 86 101 static inline void futex_exit_release(struct task_struct *tsk) { } ··· 109 88 { 110 89 return -EINVAL; 111 90 } 91 + static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) 92 + { 93 + return -EINVAL; 94 + } 95 + static inline int futex_hash_allocate_default(void) 96 + { 97 + return 0; 98 + } 99 + 
static inline void futex_hash_free(struct mm_struct *mm) { } 100 + static inline void futex_mm_init(struct mm_struct *mm) { } 101 + 112 102 #endif 113 103 114 104 #endif

+6 -1

include/linux/mm_types.h

··· 31 31 #define INIT_PASID 0 32 32 33 33 struct address_space; 34 + struct futex_private_hash; 34 35 struct mem_cgroup; 35 36 36 37 /* ··· 1032 1031 */ 1033 1032 seqcount_t mm_lock_seq; 1034 1033 #endif 1035 - 1034 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 1035 + struct mutex futex_hash_lock; 1036 + struct futex_private_hash __rcu *futex_phash; 1037 + struct futex_private_hash *futex_phash_new; 1038 + #endif 1036 1039 1037 1040 unsigned long hiwater_rss; /* High-watermark of RSS usage */ 1038 1041 unsigned long hiwater_vm; /* High-water virtual memory usage */

+4

include/linux/mmap_lock.h

··· 7 7 #include <linux/rwsem.h> 8 8 #include <linux/tracepoint-defs.h> 9 9 #include <linux/types.h> 10 + #include <linux/cleanup.h> 10 11 11 12 #define MMAP_LOCK_INITIALIZER(name) \ 12 13 .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), ··· 211 210 __mmap_lock_trace_released(mm, false); 212 211 up_read(&mm->mmap_lock); 213 212 } 213 + 214 + DEFINE_GUARD(mmap_read_lock, struct mm_struct *, 215 + mmap_read_lock(_T), mmap_read_unlock(_T)) 214 216 215 217 static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) 216 218 {

+21 -1

include/linux/rcuref.h

··· 30 30 * rcuref_read - Read the number of held reference counts of a rcuref 31 31 * @ref: Pointer to the reference count 32 32 * 33 - * Return: The number of held references (0 ... N) 33 + * Return: The number of held references (0 ... N). The value 0 does not 34 + * indicate that it is safe to schedule the object, protected by this reference 35 + * counter, for deconstruction. 36 + * If you want to know if the reference counter has been marked DEAD (as 37 + * signaled by rcuref_put()) please use rcuref_is_dead(). 34 38 */ 35 39 static inline unsigned int rcuref_read(rcuref_t *ref) 36 40 { ··· 42 38 43 39 /* Return 0 if within the DEAD zone. */ 44 40 return c >= RCUREF_RELEASED ? 0 : c + 1; 41 + } 42 + 43 + /** 44 + * rcuref_is_dead - Check if the rcuref has been already marked dead 45 + * @ref: Pointer to the reference count 46 + * 47 + * Return: True if the object has been marked DEAD. This signals that a previous 48 + * invocation of rcuref_put() returned true on this reference counter meaning 49 + * the protected object can safely be scheduled for deconstruction. 50 + * Otherwise, returns false. 51 + */ 52 + static inline bool rcuref_is_dead(rcuref_t *ref) 53 + { 54 + unsigned int c = atomic_read(&ref->refcnt); 55 + 56 + return (c >= RCUREF_RELEASED) && (c < RCUREF_NOREF); 45 57 } 46 58 47 59 extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);

+1 -1

include/linux/restart_block.h

··· 26 26 unsigned long arch_data; 27 27 long (*fn)(struct restart_block *); 28 28 union { 29 - /* For futex_wait and futex_wait_requeue_pi */ 29 + /* For futex_wait() */ 30 30 struct { 31 31 u32 __user *uaddr; 32 32 u32 val;

+7 -2

include/linux/vmalloc.h

··· 169 169 int node, const void *caller) __alloc_size(1); 170 170 #define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) 171 171 172 - void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); 173 - #define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) 172 + void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) __alloc_size(1); 173 + #define vmalloc_huge_node(...) alloc_hooks(vmalloc_huge_node_noprof(__VA_ARGS__)) 174 + 175 + static inline void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) 176 + { 177 + return vmalloc_huge_node(size, gfp_mask, NUMA_NO_NODE); 178 + } 174 179 175 180 extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); 176 181 #define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__))

+8 -1

include/uapi/linux/futex.h

··· 63 63 #define FUTEX2_SIZE_U32 0x02 64 64 #define FUTEX2_SIZE_U64 0x03 65 65 #define FUTEX2_NUMA 0x04 66 - /* 0x08 */ 66 + #define FUTEX2_MPOL 0x08 67 67 /* 0x10 */ 68 68 /* 0x20 */ 69 69 /* 0x40 */ ··· 73 73 74 74 /* do not use */ 75 75 #define FUTEX_32 FUTEX2_SIZE_U32 /* historical accident :-( */ 76 + 77 + /* 78 + * When FUTEX2_NUMA doubles the futex word, the second word is a node value. 79 + * The special value -1 indicates no-node. This is the same value as 80 + * NUMA_NO_NODE, except that value is not ABI, this is. 81 + */ 82 + #define FUTEX_NO_NODE (-1) 76 83 77 84 /* 78 85 * Max numbers of elements in a futex_waitv array

+7

include/uapi/linux/prctl.h

··· 364 364 # define PR_TIMER_CREATE_RESTORE_IDS_ON 1 365 365 # define PR_TIMER_CREATE_RESTORE_IDS_GET 2 366 366 367 + /* FUTEX hash management */ 368 + #define PR_FUTEX_HASH 78 369 + # define PR_FUTEX_HASH_SET_SLOTS 1 370 + # define FH_FLAG_IMMUTABLE (1ULL << 0) 371 + # define PR_FUTEX_HASH_GET_SLOTS 2 372 + # define PR_FUTEX_HASH_GET_IMMUTABLE 3 373 + 367 374 #endif /* _LINUX_PRCTL_H */

+10

init/Kconfig

··· 1687 1687 depends on FUTEX && RT_MUTEXES 1688 1688 default y 1689 1689 1690 + config FUTEX_PRIVATE_HASH 1691 + bool 1692 + depends on FUTEX && !BASE_SMALL && MMU 1693 + default y 1694 + 1695 + config FUTEX_MPOL 1696 + bool 1697 + depends on FUTEX && NUMA 1698 + default y 1699 + 1690 1700 config EPOLL 1691 1701 bool "Enable eventpoll support" if EXPERT 1692 1702 default y

+1 -3

io_uring/futex.c

··· 273 273 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 274 274 struct io_ring_ctx *ctx = req->ctx; 275 275 struct io_futex_data *ifd = NULL; 276 - struct futex_hash_bucket *hb; 277 276 int ret; 278 277 279 278 if (!iof->futex_mask) { ··· 294 295 ifd->req = req; 295 296 296 297 ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags, 297 - &ifd->q, &hb); 298 + &ifd->q, NULL, NULL); 298 299 if (!ret) { 299 300 hlist_add_head(&req->hash_node, &ctx->futex_list); 300 301 io_ring_submit_unlock(ctx, issue_flags); 301 302 302 - futex_queue(&ifd->q, hb, NULL); 303 303 return IOU_ISSUE_SKIP_COMPLETE; 304 304 } 305 305

+24

kernel/fork.c

··· 1306 1306 RCU_INIT_POINTER(mm->exe_file, NULL); 1307 1307 mmu_notifier_subscriptions_init(mm); 1308 1308 init_tlb_flush_pending(mm); 1309 + futex_mm_init(mm); 1309 1310 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) 1310 1311 mm->pmd_huge_pte = NULL; 1311 1312 #endif ··· 1389 1388 if (mm->binfmt) 1390 1389 module_put(mm->binfmt->module); 1391 1390 lru_gen_del_mm(mm); 1391 + futex_hash_free(mm); 1392 1392 mmdrop(mm); 1393 1393 } 1394 1394 ··· 2155 2153 #define rv_task_fork(p) do {} while (0) 2156 2154 #endif 2157 2155 2156 + static bool need_futex_hash_allocate_default(u64 clone_flags) 2157 + { 2158 + if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) 2159 + return false; 2160 + return true; 2161 + } 2162 + 2158 2163 /* 2159 2164 * This creates a new process as a copy of the old one, 2160 2165 * but does not actually start it yet. ··· 2542 2533 if (retval) 2543 2534 goto bad_fork_cancel_cgroup; 2544 2535 2536 + /* 2537 + * Allocate a default futex hash for the user process once the first 2538 + * thread spawns. 2539 + */ 2540 + if (need_futex_hash_allocate_default(clone_flags)) { 2541 + retval = futex_hash_allocate_default(); 2542 + if (retval) 2543 + goto bad_fork_core_free; 2544 + /* 2545 + * If we fail beyond this point we don't free the allocated 2546 + * futex hash map. We assume that another thread will be created 2547 + * and makes use of it. The hash map will be freed once the main 2548 + * thread terminates. 2549 + */ 2550 + } 2545 2551 /* 2546 2552 * From this point on we must avoid any synchronous user-space 2547 2553 * communication until we take the tasklist-lock. In particular, we do

+736 -75

kernel/futex/core.c

··· 36 36 #include <linux/pagemap.h> 37 37 #include <linux/debugfs.h> 38 38 #include <linux/plist.h> 39 + #include <linux/gfp.h> 40 + #include <linux/vmalloc.h> 39 41 #include <linux/memblock.h> 40 42 #include <linux/fault-inject.h> 41 43 #include <linux/slab.h> 44 + #include <linux/prctl.h> 45 + #include <linux/rcuref.h> 46 + #include <linux/mempolicy.h> 47 + #include <linux/mmap_lock.h> 42 48 43 49 #include "futex.h" 44 50 #include "../locking/rtmutex_common.h" ··· 55 49 * reside in the same cacheline. 56 50 */ 57 51 static struct { 58 - struct futex_hash_bucket *queues; 59 52 unsigned long hashmask; 53 + unsigned int hashshift; 54 + struct futex_hash_bucket *queues[MAX_NUMNODES]; 60 55 } __futex_data __read_mostly __aligned(2*sizeof(long)); 61 - #define futex_queues (__futex_data.queues) 62 - #define futex_hashmask (__futex_data.hashmask) 63 56 57 + #define futex_hashmask (__futex_data.hashmask) 58 + #define futex_hashshift (__futex_data.hashshift) 59 + #define futex_queues (__futex_data.queues) 60 + 61 + struct futex_private_hash { 62 + rcuref_t users; 63 + unsigned int hash_mask; 64 + struct rcu_head rcu; 65 + void *mm; 66 + bool custom; 67 + bool immutable; 68 + struct futex_hash_bucket queues[]; 69 + }; 64 70 65 71 /* 66 72 * Fault injections for futexes. ··· 125 107 126 108 #endif /* CONFIG_FAIL_FUTEX */ 127 109 128 - /** 129 - * futex_hash - Return the hash bucket in the global hash 130 - * @key: Pointer to the futex key for which the hash is calculated 131 - * 132 - * We hash on the keys returned from get_futex_key (see below) and return the 133 - * corresponding hash bucket in the global hash. 
134 - */ 135 - struct futex_hash_bucket *futex_hash(union futex_key *key) 136 - { 137 - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, 138 - key->both.offset); 110 + static struct futex_hash_bucket * 111 + __futex_hash(union futex_key *key, struct futex_private_hash *fph); 139 112 140 - return &futex_queues[hash & futex_hashmask]; 113 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 114 + static inline bool futex_key_is_private(union futex_key *key) 115 + { 116 + /* 117 + * Relies on get_futex_key() to set either bit for shared 118 + * futexes -- see comment with union futex_key. 119 + */ 120 + return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); 141 121 } 142 122 123 + bool futex_private_hash_get(struct futex_private_hash *fph) 124 + { 125 + if (fph->immutable) 126 + return true; 127 + return rcuref_get(&fph->users); 128 + } 129 + 130 + void futex_private_hash_put(struct futex_private_hash *fph) 131 + { 132 + /* Ignore return value, last put is verified via rcuref_is_dead() */ 133 + if (fph->immutable) 134 + return; 135 + if (rcuref_put(&fph->users)) 136 + wake_up_var(fph->mm); 137 + } 138 + 139 + /** 140 + * futex_hash_get - Get an additional reference for the local hash. 141 + * @hb: ptr to the private local hash. 142 + * 143 + * Obtain an additional reference for the already obtained hash bucket. The 144 + * caller must already own a reference. 
145 + */ 146 + void futex_hash_get(struct futex_hash_bucket *hb) 147 + { 148 + struct futex_private_hash *fph = hb->priv; 149 + 150 + if (!fph) 151 + return; 152 + WARN_ON_ONCE(!futex_private_hash_get(fph)); 153 + } 154 + 155 + void futex_hash_put(struct futex_hash_bucket *hb) 156 + { 157 + struct futex_private_hash *fph = hb->priv; 158 + 159 + if (!fph) 160 + return; 161 + futex_private_hash_put(fph); 162 + } 163 + 164 + static struct futex_hash_bucket * 165 + __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) 166 + { 167 + u32 hash; 168 + 169 + if (!futex_key_is_private(key)) 170 + return NULL; 171 + 172 + if (!fph) 173 + fph = rcu_dereference(key->private.mm->futex_phash); 174 + if (!fph || !fph->hash_mask) 175 + return NULL; 176 + 177 + hash = jhash2((void *)&key->private.address, 178 + sizeof(key->private.address) / 4, 179 + key->both.offset); 180 + return &fph->queues[hash & fph->hash_mask]; 181 + } 182 + 183 + static void futex_rehash_private(struct futex_private_hash *old, 184 + struct futex_private_hash *new) 185 + { 186 + struct futex_hash_bucket *hb_old, *hb_new; 187 + unsigned int slots = old->hash_mask + 1; 188 + unsigned int i; 189 + 190 + for (i = 0; i < slots; i++) { 191 + struct futex_q *this, *tmp; 192 + 193 + hb_old = &old->queues[i]; 194 + 195 + spin_lock(&hb_old->lock); 196 + plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { 197 + 198 + plist_del(&this->list, &hb_old->chain); 199 + futex_hb_waiters_dec(hb_old); 200 + 201 + WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); 202 + 203 + hb_new = __futex_hash(&this->key, new); 204 + futex_hb_waiters_inc(hb_new); 205 + /* 206 + * The new pointer isn't published yet but an already 207 + * moved user can be unqueued due to timeout or signal. 
208 + */ 209 + spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); 210 + plist_add(&this->list, &hb_new->chain); 211 + this->lock_ptr = &hb_new->lock; 212 + spin_unlock(&hb_new->lock); 213 + } 214 + spin_unlock(&hb_old->lock); 215 + } 216 + } 217 + 218 + static bool __futex_pivot_hash(struct mm_struct *mm, 219 + struct futex_private_hash *new) 220 + { 221 + struct futex_private_hash *fph; 222 + 223 + WARN_ON_ONCE(mm->futex_phash_new); 224 + 225 + fph = rcu_dereference_protected(mm->futex_phash, 226 + lockdep_is_held(&mm->futex_hash_lock)); 227 + if (fph) { 228 + if (!rcuref_is_dead(&fph->users)) { 229 + mm->futex_phash_new = new; 230 + return false; 231 + } 232 + 233 + futex_rehash_private(fph, new); 234 + } 235 + rcu_assign_pointer(mm->futex_phash, new); 236 + kvfree_rcu(fph, rcu); 237 + return true; 238 + } 239 + 240 + static void futex_pivot_hash(struct mm_struct *mm) 241 + { 242 + scoped_guard(mutex, &mm->futex_hash_lock) { 243 + struct futex_private_hash *fph; 244 + 245 + fph = mm->futex_phash_new; 246 + if (fph) { 247 + mm->futex_phash_new = NULL; 248 + __futex_pivot_hash(mm, fph); 249 + } 250 + } 251 + } 252 + 253 + struct futex_private_hash *futex_private_hash(void) 254 + { 255 + struct mm_struct *mm = current->mm; 256 + /* 257 + * Ideally we don't loop. If there is a replacement in progress 258 + * then a new private hash is already prepared and a reference can't be 259 + * obtained once the last user dropped its reference. 260 + * In that case we block on mm_struct::futex_hash_lock and either have 261 + * to perform the replacement or wait while someone else is doing the 262 + * job. Either way, on the second iteration we acquire a reference on the 263 + * new private hash or loop again because a new replacement has been 264 + * requested. 
265 + */ 266 + again: 267 + scoped_guard(rcu) { 268 + struct futex_private_hash *fph; 269 + 270 + fph = rcu_dereference(mm->futex_phash); 271 + if (!fph) 272 + return NULL; 273 + 274 + if (fph->immutable) 275 + return fph; 276 + if (rcuref_get(&fph->users)) 277 + return fph; 278 + } 279 + futex_pivot_hash(mm); 280 + goto again; 281 + } 282 + 283 + struct futex_hash_bucket *futex_hash(union futex_key *key) 284 + { 285 + struct futex_private_hash *fph; 286 + struct futex_hash_bucket *hb; 287 + 288 + again: 289 + scoped_guard(rcu) { 290 + hb = __futex_hash(key, NULL); 291 + fph = hb->priv; 292 + 293 + if (!fph || futex_private_hash_get(fph)) 294 + return hb; 295 + } 296 + futex_pivot_hash(key->private.mm); 297 + goto again; 298 + } 299 + 300 + #else /* !CONFIG_FUTEX_PRIVATE_HASH */ 301 + 302 + static struct futex_hash_bucket * 303 + __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) 304 + { 305 + return NULL; 306 + } 307 + 308 + struct futex_hash_bucket *futex_hash(union futex_key *key) 309 + { 310 + return __futex_hash(key, NULL); 311 + } 312 + 313 + #endif /* CONFIG_FUTEX_PRIVATE_HASH */ 314 + 315 + #ifdef CONFIG_FUTEX_MPOL 316 + 317 + static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) 318 + { 319 + struct vm_area_struct *vma = vma_lookup(mm, addr); 320 + struct mempolicy *mpol; 321 + int node = FUTEX_NO_NODE; 322 + 323 + if (!vma) 324 + return FUTEX_NO_NODE; 325 + 326 + mpol = vma_policy(vma); 327 + if (!mpol) 328 + return FUTEX_NO_NODE; 329 + 330 + switch (mpol->mode) { 331 + case MPOL_PREFERRED: 332 + node = first_node(mpol->nodes); 333 + break; 334 + case MPOL_PREFERRED_MANY: 335 + case MPOL_BIND: 336 + if (mpol->home_node != NUMA_NO_NODE) 337 + node = mpol->home_node; 338 + break; 339 + default: 340 + break; 341 + } 342 + 343 + return node; 344 + } 345 + 346 + static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) 347 + { 348 + int seq, node; 349 + 350 + guard(rcu)(); 351 + 352 + if 
(!mmap_lock_speculate_try_begin(mm, &seq)) 353 + return -EBUSY; 354 + 355 + node = __futex_key_to_node(mm, addr); 356 + 357 + if (mmap_lock_speculate_retry(mm, seq)) 358 + return -EAGAIN; 359 + 360 + return node; 361 + } 362 + 363 + static int futex_mpol(struct mm_struct *mm, unsigned long addr) 364 + { 365 + int node; 366 + 367 + node = futex_key_to_node_opt(mm, addr); 368 + if (node >= FUTEX_NO_NODE) 369 + return node; 370 + 371 + guard(mmap_read_lock)(mm); 372 + return __futex_key_to_node(mm, addr); 373 + } 374 + 375 + #else /* !CONFIG_FUTEX_MPOL */ 376 + 377 + static int futex_mpol(struct mm_struct *mm, unsigned long addr) 378 + { 379 + return FUTEX_NO_NODE; 380 + } 381 + 382 + #endif /* CONFIG_FUTEX_MPOL */ 383 + 384 + /** 385 + * __futex_hash - Return the hash bucket 386 + * @key: Pointer to the futex key for which the hash is calculated 387 + * @fph: Pointer to private hash if known 388 + * 389 + * We hash on the keys returned from get_futex_key (see below) and return the 390 + * corresponding hash bucket. 391 + * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the 392 + * private hash) is returned if existing. Otherwise a hash bucket from the 393 + * global hash is returned. 394 + */ 395 + static struct futex_hash_bucket * 396 + __futex_hash(union futex_key *key, struct futex_private_hash *fph) 397 + { 398 + int node = key->both.node; 399 + u32 hash; 400 + 401 + if (node == FUTEX_NO_NODE) { 402 + struct futex_hash_bucket *hb; 403 + 404 + hb = __futex_hash_private(key, fph); 405 + if (hb) 406 + return hb; 407 + } 408 + 409 + hash = jhash2((u32 *)key, 410 + offsetof(typeof(*key), both.offset) / sizeof(u32), 411 + key->both.offset); 412 + 413 + if (node == FUTEX_NO_NODE) { 414 + /* 415 + * In case of !FLAGS_NUMA, use some unused hash bits to pick a 416 + * node -- this ensures regular futexes are interleaved across 417 + * the nodes and avoids having to allocate multiple 418 + * hash-tables. 
419 + * 420 + * NOTE: this isn't perfectly uniform, but it is fast and 421 + * handles sparse node masks. 422 + */ 423 + node = (hash >> futex_hashshift) % nr_node_ids; 424 + if (!node_possible(node)) { 425 + node = find_next_bit_wrap(node_possible_map.bits, 426 + nr_node_ids, node); 427 + } 428 + } 429 + 430 + return &futex_queues[node][hash & futex_hashmask]; 431 + } 143 432 144 433 /** 145 434 * futex_setup_timer - set up the sleeping hrtimer. ··· 552 227 struct page *page; 553 228 struct folio *folio; 554 229 struct address_space *mapping; 555 - int err, ro = 0; 230 + int node, err, size, ro = 0; 231 + bool node_updated = false; 556 232 bool fshared; 557 233 558 234 fshared = flags & FLAGS_SHARED; 235 + size = futex_size(flags); 236 + if (flags & FLAGS_NUMA) 237 + size *= 2; 559 238 560 239 /* 561 240 * The futex address must be "naturally" aligned. 562 241 */ 563 242 key->both.offset = address % PAGE_SIZE; 564 - if (unlikely((address % sizeof(u32)) != 0)) 243 + if (unlikely((address % size) != 0)) 565 244 return -EINVAL; 566 245 address -= key->both.offset; 567 246 568 - if (unlikely(!access_ok(uaddr, sizeof(u32)))) 247 + if (unlikely(!access_ok(uaddr, size))) 569 248 return -EFAULT; 570 249 571 250 if (unlikely(should_fail_futex(fshared))) 572 251 return -EFAULT; 252 + 253 + node = FUTEX_NO_NODE; 254 + 255 + if (flags & FLAGS_NUMA) { 256 + u32 __user *naddr = (void *)uaddr + size / 2; 257 + 258 + if (futex_get_value(&node, naddr)) 259 + return -EFAULT; 260 + 261 + if (node != FUTEX_NO_NODE && 262 + (node >= MAX_NUMNODES || !node_possible(node))) 263 + return -EINVAL; 264 + } 265 + 266 + if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { 267 + node = futex_mpol(mm, address); 268 + node_updated = true; 269 + } 270 + 271 + if (flags & FLAGS_NUMA) { 272 + u32 __user *naddr = (void *)uaddr + size / 2; 273 + 274 + if (node == FUTEX_NO_NODE) { 275 + node = numa_node_id(); 276 + node_updated = true; 277 + } 278 + if (node_updated && futex_put_value(node, naddr)) 
279 + return -EFAULT; 280 + } 281 + 282 + key->both.node = node; 573 283 574 284 /* 575 285 * PROCESS_PRIVATE futexes are fast. ··· 862 502 } 863 503 864 504 /* The key must be already stored in q->key. */ 865 - struct futex_hash_bucket *futex_q_lock(struct futex_q *q) 505 + void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) 866 506 __acquires(&hb->lock) 867 507 { 868 - struct futex_hash_bucket *hb; 869 - 870 - hb = futex_hash(&q->key); 871 - 872 508 /* 873 509 * Increment the counter before taking the lock so that 874 510 * a potential waker won't miss a to-be-slept task that is ··· 878 522 q->lock_ptr = &hb->lock; 879 523 880 524 spin_lock(&hb->lock); 881 - return hb; 882 525 } 883 526 884 527 void futex_q_unlock(struct futex_hash_bucket *hb) 885 528 __releases(&hb->lock) 886 529 { 887 - spin_unlock(&hb->lock); 888 530 futex_hb_waiters_dec(hb); 531 + spin_unlock(&hb->lock); 889 532 } 890 533 891 534 void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, ··· 923 568 spinlock_t *lock_ptr; 924 569 int ret = 0; 925 570 571 + /* RCU so lock_ptr is not going away during locking. */ 572 + guard(rcu)(); 926 573 /* In the common case we don't take the spinlock, which is nice. */ 927 574 retry: 928 575 /* ··· 961 604 } 962 605 963 606 return ret; 607 + } 608 + 609 + void futex_q_lockptr_lock(struct futex_q *q) 610 + { 611 + spinlock_t *lock_ptr; 612 + 613 + /* 614 + * See futex_unqueue() why lock_ptr can change. 615 + */ 616 + guard(rcu)(); 617 + retry: 618 + lock_ptr = READ_ONCE(q->lock_ptr); 619 + spin_lock(lock_ptr); 620 + 621 + if (unlikely(lock_ptr != q->lock_ptr)) { 622 + spin_unlock(lock_ptr); 623 + goto retry; 624 + } 964 625 } 965 626 966 627 /* ··· 1324 949 { 1325 950 struct list_head *next, *head = &curr->pi_state_list; 1326 951 struct futex_pi_state *pi_state; 1327 - struct futex_hash_bucket *hb; 1328 952 union futex_key key = FUTEX_KEY_INIT; 1329 953 954 + /* 955 + * The mutex mm_struct::futex_hash_lock might be acquired. 
956 + */ 957 + might_sleep(); 958 + /* 959 + * Ensure the hash remains stable (no resize) during the while loop 960 + * below. The hb pointer is acquired under the pi_lock so we can't block 961 + * on the mutex. 962 + */ 963 + WARN_ON(curr != current); 964 + guard(private_hash)(); 1330 965 /* 1331 966 * We are a ZOMBIE and nobody can enqueue itself on 1332 967 * pi_state_list anymore, but we have to be careful ··· 1347 962 next = head->next; 1348 963 pi_state = list_entry(next, struct futex_pi_state, list); 1349 964 key = pi_state->key; 1350 - hb = futex_hash(&key); 965 + if (1) { 966 + CLASS(hb, hb)(&key); 1351 967 1352 - /* 1353 - * We can race against put_pi_state() removing itself from the 1354 - * list (a waiter going away). put_pi_state() will first 1355 - * decrement the reference count and then modify the list, so 1356 - * its possible to see the list entry but fail this reference 1357 - * acquire. 1358 - * 1359 - * In that case; drop the locks to let put_pi_state() make 1360 - * progress and retry the loop. 1361 - */ 1362 - if (!refcount_inc_not_zero(&pi_state->refcount)) { 968 + /* 969 + * We can race against put_pi_state() removing itself from the 970 + * list (a waiter going away). put_pi_state() will first 971 + * decrement the reference count and then modify the list, so 972 + * its possible to see the list entry but fail this reference 973 + * acquire. 974 + * 975 + * In that case; drop the locks to let put_pi_state() make 976 + * progress and retry the loop. 
977 + */ 978 + if (!refcount_inc_not_zero(&pi_state->refcount)) { 979 + raw_spin_unlock_irq(&curr->pi_lock); 980 + cpu_relax(); 981 + raw_spin_lock_irq(&curr->pi_lock); 982 + continue; 983 + } 1363 984 raw_spin_unlock_irq(&curr->pi_lock); 1364 - cpu_relax(); 1365 - raw_spin_lock_irq(&curr->pi_lock); 1366 - continue; 1367 - } 1368 - raw_spin_unlock_irq(&curr->pi_lock); 1369 985 1370 - spin_lock(&hb->lock); 1371 - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 1372 - raw_spin_lock(&curr->pi_lock); 1373 - /* 1374 - * We dropped the pi-lock, so re-check whether this 1375 - * task still owns the PI-state: 1376 - */ 1377 - if (head->next != next) { 1378 - /* retain curr->pi_lock for the loop invariant */ 1379 - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 986 + spin_lock(&hb->lock); 987 + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 988 + raw_spin_lock(&curr->pi_lock); 989 + /* 990 + * We dropped the pi-lock, so re-check whether this 991 + * task still owns the PI-state: 992 + */ 993 + if (head->next != next) { 994 + /* retain curr->pi_lock for the loop invariant */ 995 + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 996 + spin_unlock(&hb->lock); 997 + put_pi_state(pi_state); 998 + continue; 999 + } 1000 + 1001 + WARN_ON(pi_state->owner != curr); 1002 + WARN_ON(list_empty(&pi_state->list)); 1003 + list_del_init(&pi_state->list); 1004 + pi_state->owner = NULL; 1005 + 1006 + raw_spin_unlock(&curr->pi_lock); 1007 + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1380 1008 spin_unlock(&hb->lock); 1381 - put_pi_state(pi_state); 1382 - continue; 1383 1009 } 1384 - 1385 - WARN_ON(pi_state->owner != curr); 1386 - WARN_ON(list_empty(&pi_state->list)); 1387 - list_del_init(&pi_state->list); 1388 - pi_state->owner = NULL; 1389 - 1390 - raw_spin_unlock(&curr->pi_lock); 1391 - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1392 - spin_unlock(&hb->lock); 1393 1010 1394 1011 rt_mutex_futex_unlock(&pi_state->pi_mutex); 1395 1012 put_pi_state(pi_state); ··· 1512 
1125 futex_cleanup_end(tsk, FUTEX_STATE_DEAD); 1513 1126 } 1514 1127 1128 + static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, 1129 + struct futex_private_hash *fph) 1130 + { 1131 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 1132 + fhb->priv = fph; 1133 + #endif 1134 + atomic_set(&fhb->waiters, 0); 1135 + plist_head_init(&fhb->chain); 1136 + spin_lock_init(&fhb->lock); 1137 + } 1138 + 1139 + #define FH_CUSTOM 0x01 1140 + #define FH_IMMUTABLE 0x02 1141 + 1142 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 1143 + void futex_hash_free(struct mm_struct *mm) 1144 + { 1145 + struct futex_private_hash *fph; 1146 + 1147 + kvfree(mm->futex_phash_new); 1148 + fph = rcu_dereference_raw(mm->futex_phash); 1149 + if (fph) { 1150 + WARN_ON_ONCE(rcuref_read(&fph->users) > 1); 1151 + kvfree(fph); 1152 + } 1153 + } 1154 + 1155 + static bool futex_pivot_pending(struct mm_struct *mm) 1156 + { 1157 + struct futex_private_hash *fph; 1158 + 1159 + guard(rcu)(); 1160 + 1161 + if (!mm->futex_phash_new) 1162 + return true; 1163 + 1164 + fph = rcu_dereference(mm->futex_phash); 1165 + return rcuref_is_dead(&fph->users); 1166 + } 1167 + 1168 + static bool futex_hash_less(struct futex_private_hash *a, 1169 + struct futex_private_hash *b) 1170 + { 1171 + /* user provided always wins */ 1172 + if (!a->custom && b->custom) 1173 + return true; 1174 + if (a->custom && !b->custom) 1175 + return false; 1176 + 1177 + /* zero-sized hash wins */ 1178 + if (!b->hash_mask) 1179 + return true; 1180 + if (!a->hash_mask) 1181 + return false; 1182 + 1183 + /* keep the biggest */ 1184 + if (a->hash_mask < b->hash_mask) 1185 + return true; 1186 + if (a->hash_mask > b->hash_mask) 1187 + return false; 1188 + 1189 + return false; /* equal */ 1190 + } 1191 + 1192 + static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) 1193 + { 1194 + struct mm_struct *mm = current->mm; 1195 + struct futex_private_hash *fph; 1196 + bool custom = flags & FH_CUSTOM; 1197 + int i; 1198 + 1199 + if (hash_slots && 
(hash_slots == 1 || !is_power_of_2(hash_slots))) 1200 + return -EINVAL; 1201 + 1202 + /* 1203 + * Once we've disabled the global hash there is no way back. 1204 + */ 1205 + scoped_guard(rcu) { 1206 + fph = rcu_dereference(mm->futex_phash); 1207 + if (fph && (!fph->hash_mask || fph->immutable)) { 1208 + if (custom) 1209 + return -EBUSY; 1210 + return 0; 1211 + } 1212 + } 1213 + 1214 + fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1215 + if (!fph) 1216 + return -ENOMEM; 1217 + 1218 + rcuref_init(&fph->users, 1); 1219 + fph->hash_mask = hash_slots ? hash_slots - 1 : 0; 1220 + fph->custom = custom; 1221 + fph->immutable = !!(flags & FH_IMMUTABLE); 1222 + fph->mm = mm; 1223 + 1224 + for (i = 0; i < hash_slots; i++) 1225 + futex_hash_bucket_init(&fph->queues[i], fph); 1226 + 1227 + if (custom) { 1228 + /* 1229 + * Only let prctl() wait / retry; don't unduly delay clone(). 1230 + */ 1231 + again: 1232 + wait_var_event(mm, futex_pivot_pending(mm)); 1233 + } 1234 + 1235 + scoped_guard(mutex, &mm->futex_hash_lock) { 1236 + struct futex_private_hash *free __free(kvfree) = NULL; 1237 + struct futex_private_hash *cur, *new; 1238 + 1239 + cur = rcu_dereference_protected(mm->futex_phash, 1240 + lockdep_is_held(&mm->futex_hash_lock)); 1241 + new = mm->futex_phash_new; 1242 + mm->futex_phash_new = NULL; 1243 + 1244 + if (fph) { 1245 + if (cur && !new) { 1246 + /* 1247 + * If we have an existing hash, but do not yet have 1248 + * allocated a replacement hash, drop the initial 1249 + * reference on the existing hash. 1250 + */ 1251 + futex_private_hash_put(cur); 1252 + } 1253 + 1254 + if (new) { 1255 + /* 1256 + * Two updates raced; throw out the lesser one. 
1257 + */ 1258 + if (futex_hash_less(new, fph)) { 1259 + free = new; 1260 + new = fph; 1261 + } else { 1262 + free = fph; 1263 + } 1264 + } else { 1265 + new = fph; 1266 + } 1267 + fph = NULL; 1268 + } 1269 + 1270 + if (new) { 1271 + /* 1272 + * Will set mm->futex_phash_new on failure; 1273 + * futex_private_hash_get() will try again. 1274 + */ 1275 + if (!__futex_pivot_hash(mm, new) && custom) 1276 + goto again; 1277 + } 1278 + } 1279 + return 0; 1280 + } 1281 + 1282 + int futex_hash_allocate_default(void) 1283 + { 1284 + unsigned int threads, buckets, current_buckets = 0; 1285 + struct futex_private_hash *fph; 1286 + 1287 + if (!current->mm) 1288 + return 0; 1289 + 1290 + scoped_guard(rcu) { 1291 + threads = min_t(unsigned int, 1292 + get_nr_threads(current), 1293 + num_online_cpus()); 1294 + 1295 + fph = rcu_dereference(current->mm->futex_phash); 1296 + if (fph) { 1297 + if (fph->custom) 1298 + return 0; 1299 + 1300 + current_buckets = fph->hash_mask + 1; 1301 + } 1302 + } 1303 + 1304 + /* 1305 + * The default allocation will remain within 1306 + * 16 <= threads * 4 <= global hash size 1307 + */ 1308 + buckets = roundup_pow_of_two(4 * threads); 1309 + buckets = clamp(buckets, 16, futex_hashmask + 1); 1310 + 1311 + if (current_buckets >= buckets) 1312 + return 0; 1313 + 1314 + return futex_hash_allocate(buckets, 0); 1315 + } 1316 + 1317 + static int futex_hash_get_slots(void) 1318 + { 1319 + struct futex_private_hash *fph; 1320 + 1321 + guard(rcu)(); 1322 + fph = rcu_dereference(current->mm->futex_phash); 1323 + if (fph && fph->hash_mask) 1324 + return fph->hash_mask + 1; 1325 + return 0; 1326 + } 1327 + 1328 + static int futex_hash_get_immutable(void) 1329 + { 1330 + struct futex_private_hash *fph; 1331 + 1332 + guard(rcu)(); 1333 + fph = rcu_dereference(current->mm->futex_phash); 1334 + if (fph && fph->immutable) 1335 + return 1; 1336 + if (fph && !fph->hash_mask) 1337 + return 1; 1338 + return 0; 1339 + } 1340 + 1341 + #else 1342 + 1343 + static int 
futex_hash_allocate(unsigned int hash_slots, unsigned int flags) 1344 + { 1345 + return -EINVAL; 1346 + } 1347 + 1348 + static int futex_hash_get_slots(void) 1349 + { 1350 + return 0; 1351 + } 1352 + 1353 + static int futex_hash_get_immutable(void) 1354 + { 1355 + return 0; 1356 + } 1357 + #endif 1358 + 1359 + int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) 1360 + { 1361 + unsigned int flags = FH_CUSTOM; 1362 + int ret; 1363 + 1364 + switch (arg2) { 1365 + case PR_FUTEX_HASH_SET_SLOTS: 1366 + if (arg4 & ~FH_FLAG_IMMUTABLE) 1367 + return -EINVAL; 1368 + if (arg4 & FH_FLAG_IMMUTABLE) 1369 + flags |= FH_IMMUTABLE; 1370 + ret = futex_hash_allocate(arg3, flags); 1371 + break; 1372 + 1373 + case PR_FUTEX_HASH_GET_SLOTS: 1374 + ret = futex_hash_get_slots(); 1375 + break; 1376 + 1377 + case PR_FUTEX_HASH_GET_IMMUTABLE: 1378 + ret = futex_hash_get_immutable(); 1379 + break; 1380 + 1381 + default: 1382 + ret = -EINVAL; 1383 + break; 1384 + } 1385 + return ret; 1386 + } 1387 + 1515 1388 static int __init futex_init(void) 1516 1389 { 1517 1390 unsigned long hashsize, i; 1518 - unsigned int futex_shift; 1391 + unsigned int order, n; 1392 + unsigned long size; 1519 1393 1520 1394 #ifdef CONFIG_BASE_SMALL 1521 1395 hashsize = 16; 1522 1396 #else 1523 - hashsize = roundup_pow_of_two(256 * num_possible_cpus()); 1397 + hashsize = 256 * num_possible_cpus(); 1398 + hashsize /= num_possible_nodes(); 1399 + hashsize = max(4, hashsize); 1400 + hashsize = roundup_pow_of_two(hashsize); 1524 1401 #endif 1402 + futex_hashshift = ilog2(hashsize); 1403 + size = sizeof(struct futex_hash_bucket) * hashsize; 1404 + order = get_order(size); 1525 1405 1526 - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), 1527 - hashsize, 0, 0, 1528 - &futex_shift, NULL, 1529 - hashsize, hashsize); 1530 - hashsize = 1UL << futex_shift; 1406 + for_each_node(n) { 1407 + struct futex_hash_bucket *table; 1531 1408 1532 - for (i = 0; i < hashsize; i++) { 1533 - 
atomic_set(&futex_queues[i].waiters, 0); 1534 - plist_head_init(&futex_queues[i].chain); 1535 - spin_lock_init(&futex_queues[i].lock); 1409 + if (order > MAX_PAGE_ORDER) 1410 + table = vmalloc_huge_node(size, GFP_KERNEL, n); 1411 + else 1412 + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); 1413 + 1414 + BUG_ON(!table); 1415 + 1416 + for (i = 0; i < hashsize; i++) 1417 + futex_hash_bucket_init(&table[i], NULL); 1418 + 1419 + futex_queues[n] = table; 1536 1420 } 1537 1421 1538 1422 futex_hashmask = hashsize - 1; 1423 + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", 1424 + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, 1425 + order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); 1539 1426 return 0; 1540 1427 } 1541 1428 core_initcall(futex_init);

+66 -8

kernel/futex/futex.h

··· 7 7 #include <linux/sched/wake_q.h> 8 8 #include <linux/compat.h> 9 9 #include <linux/uaccess.h> 10 + #include <linux/cleanup.h> 10 11 11 12 #ifdef CONFIG_PREEMPT_RT 12 13 #include <linux/rcuwait.h> ··· 39 38 #define FLAGS_HAS_TIMEOUT 0x0040 40 39 #define FLAGS_NUMA 0x0080 41 40 #define FLAGS_STRICT 0x0100 41 + #define FLAGS_MPOL 0x0200 42 42 43 43 /* FUTEX_ to FLAGS_ */ 44 44 static inline unsigned int futex_to_flags(unsigned int op) ··· 55 53 return flags; 56 54 } 57 55 58 - #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) 56 + #define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) 59 57 60 58 /* FUTEX2_ to FLAGS_ */ 61 59 static inline unsigned int futex2_to_flags(unsigned int flags2) ··· 67 65 68 66 if (flags2 & FUTEX2_NUMA) 69 67 flags |= FLAGS_NUMA; 68 + 69 + if (flags2 & FUTEX2_MPOL) 70 + flags |= FLAGS_MPOL; 70 71 71 72 return flags; 72 73 } ··· 90 85 /* Only 32bit futexes are implemented -- for now */ 91 86 if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) 92 87 return false; 88 + 89 + /* 90 + * Must be able to represent both FUTEX_NO_NODE and every valid nodeid 91 + * in a futex word. 
92 + */ 93 + if (flags & FLAGS_NUMA) { 94 + int bits = 8 * futex_size(flags); 95 + u64 max = ~0ULL; 96 + 97 + max >>= 64 - bits; 98 + if (nr_node_ids >= max) 99 + return false; 100 + } 93 101 94 102 return true; 95 103 } ··· 135 117 atomic_t waiters; 136 118 spinlock_t lock; 137 119 struct plist_head chain; 120 + struct futex_private_hash *priv; 138 121 } ____cacheline_aligned_in_smp; 139 122 140 123 /* ··· 175 156 * @requeue_pi_key: the requeue_pi target futex key 176 157 * @bitset: bitset for the optional bitmasked wakeup 177 158 * @requeue_state: State field for futex_requeue_pi() 159 + * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true 178 160 * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) 179 161 * 180 162 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so ··· 202 182 union futex_key *requeue_pi_key; 203 183 u32 bitset; 204 184 atomic_t requeue_state; 185 + bool drop_hb_ref; 205 186 #ifdef CONFIG_PREEMPT_RT 206 187 struct rcuwait requeue_wait; 207 188 #endif ··· 217 196 218 197 extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, 219 198 enum futex_access rw); 220 - 199 + extern void futex_q_lockptr_lock(struct futex_q *q); 221 200 extern struct hrtimer_sleeper * 222 201 futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, 223 202 int flags, u64 range_ns); 224 203 225 204 extern struct futex_hash_bucket *futex_hash(union futex_key *key); 205 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 206 + extern void futex_hash_get(struct futex_hash_bucket *hb); 207 + extern void futex_hash_put(struct futex_hash_bucket *hb); 208 + 209 + extern struct futex_private_hash *futex_private_hash(void); 210 + extern bool futex_private_hash_get(struct futex_private_hash *fph); 211 + extern void futex_private_hash_put(struct futex_private_hash *fph); 212 + 213 + #else /* !CONFIG_FUTEX_PRIVATE_HASH */ 214 + static inline void futex_hash_get(struct futex_hash_bucket *hb) { } 215 + static 
inline void futex_hash_put(struct futex_hash_bucket *hb) { } 216 + static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } 217 + static inline bool futex_private_hash_get(void) { return false; } 218 + static inline void futex_private_hash_put(struct futex_private_hash *fph) { } 219 + #endif 220 + 221 + DEFINE_CLASS(hb, struct futex_hash_bucket *, 222 + if (_T) futex_hash_put(_T), 223 + futex_hash(key), union futex_key *key); 224 + 225 + DEFINE_CLASS(private_hash, struct futex_private_hash *, 226 + if (_T) futex_private_hash_put(_T), 227 + futex_private_hash(), void); 226 228 227 229 /** 228 230 * futex_match - Check whether two futex keys are equal ··· 263 219 } 264 220 265 221 extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 266 - struct futex_q *q, struct futex_hash_bucket **hb); 267 - extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, 268 - struct hrtimer_sleeper *timeout); 222 + struct futex_q *q, union futex_key *key2, 223 + struct task_struct *task); 224 + extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); 269 225 extern bool __futex_wake_mark(struct futex_q *q); 270 226 extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); 271 227 ··· 300 256 * This looks a bit overkill, but generally just results in a couple 301 257 * of instructions. 
302 258 */ 303 - static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) 259 + static __always_inline int futex_get_value(u32 *dest, u32 __user *from) 304 260 { 305 261 u32 val; 306 262 ··· 317 273 return -EFAULT; 318 274 } 319 275 276 + static __always_inline int futex_put_value(u32 val, u32 __user *to) 277 + { 278 + if (can_do_masked_user_access()) 279 + to = masked_user_access_begin(to); 280 + else if (!user_read_access_begin(to, sizeof(*to))) 281 + return -EFAULT; 282 + unsafe_put_user(val, to, Efault); 283 + user_read_access_end(); 284 + return 0; 285 + Efault: 286 + user_read_access_end(); 287 + return -EFAULT; 288 + } 289 + 320 290 static inline int futex_get_value_locked(u32 *dest, u32 __user *from) 321 291 { 322 292 int ret; 323 293 324 294 pagefault_disable(); 325 - ret = futex_read_inatomic(dest, from); 295 + ret = futex_get_value(dest, from); 326 296 pagefault_enable(); 327 297 328 298 return ret; ··· 412 354 #endif 413 355 } 414 356 415 - extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); 357 + extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); 416 358 extern void futex_q_unlock(struct futex_hash_bucket *hb); 417 359 418 360

+165 -141

kernel/futex/pi.c

··· 806 806 break; 807 807 } 808 808 809 - spin_lock(q->lock_ptr); 809 + futex_q_lockptr_lock(q); 810 810 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 811 811 812 812 /* ··· 920 920 struct hrtimer_sleeper timeout, *to; 921 921 struct task_struct *exiting = NULL; 922 922 struct rt_mutex_waiter rt_waiter; 923 - struct futex_hash_bucket *hb; 924 923 struct futex_q q = futex_q_init; 925 924 DEFINE_WAKE_Q(wake_q); 926 925 int res, ret; ··· 938 939 goto out; 939 940 940 941 retry_private: 941 - hb = futex_q_lock(&q); 942 + if (1) { 943 + CLASS(hb, hb)(&q.key); 942 944 943 - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 944 - &exiting, 0); 945 - if (unlikely(ret)) { 946 - /* 947 - * Atomic work succeeded and we got the lock, 948 - * or failed. Either way, we do _not_ block. 949 - */ 950 - switch (ret) { 951 - case 1: 952 - /* We got the lock. */ 953 - ret = 0; 954 - goto out_unlock_put_key; 955 - case -EFAULT: 956 - goto uaddr_faulted; 957 - case -EBUSY: 958 - case -EAGAIN: 945 + futex_q_lock(&q, hb); 946 + 947 + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 948 + &exiting, 0); 949 + if (unlikely(ret)) { 959 950 /* 960 - * Two reasons for this: 961 - * - EBUSY: Task is exiting and we just wait for the 962 - * exit to complete. 963 - * - EAGAIN: The user space value changed. 951 + * Atomic work succeeded and we got the lock, 952 + * or failed. Either way, we do _not_ block. 964 953 */ 965 - futex_q_unlock(hb); 966 - /* 967 - * Handle the case where the owner is in the middle of 968 - * exiting. Wait for the exit to complete otherwise 969 - * this task might loop forever, aka. live lock. 970 - */ 971 - wait_for_owner_exiting(ret, exiting); 972 - cond_resched(); 973 - goto retry; 974 - default: 975 - goto out_unlock_put_key; 954 + switch (ret) { 955 + case 1: 956 + /* We got the lock. 
*/ 957 + ret = 0; 958 + goto out_unlock_put_key; 959 + case -EFAULT: 960 + goto uaddr_faulted; 961 + case -EBUSY: 962 + case -EAGAIN: 963 + /* 964 + * Two reasons for this: 965 + * - EBUSY: Task is exiting and we just wait for the 966 + * exit to complete. 967 + * - EAGAIN: The user space value changed. 968 + */ 969 + futex_q_unlock(hb); 970 + /* 971 + * Handle the case where the owner is in the middle of 972 + * exiting. Wait for the exit to complete otherwise 973 + * this task might loop forever, aka. live lock. 974 + */ 975 + wait_for_owner_exiting(ret, exiting); 976 + cond_resched(); 977 + goto retry; 978 + default: 979 + goto out_unlock_put_key; 980 + } 976 981 } 977 - } 978 982 979 - WARN_ON(!q.pi_state); 983 + WARN_ON(!q.pi_state); 980 984 981 - /* 982 - * Only actually queue now that the atomic ops are done: 983 - */ 984 - __futex_queue(&q, hb, current); 985 + /* 986 + * Only actually queue now that the atomic ops are done: 987 + */ 988 + __futex_queue(&q, hb, current); 985 989 986 - if (trylock) { 987 - ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 988 - /* Fixup the trylock return value: */ 989 - ret = ret ? 0 : -EWOULDBLOCK; 990 - goto no_block; 991 - } 990 + if (trylock) { 991 + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 992 + /* Fixup the trylock return value: */ 993 + ret = ret ? 0 : -EWOULDBLOCK; 994 + goto no_block; 995 + } 992 996 993 - /* 994 - * Must be done before we enqueue the waiter, here is unfortunately 995 - * under the hb lock, but that *should* work because it does nothing. 996 - */ 997 - rt_mutex_pre_schedule(); 997 + /* 998 + * Caution; releasing @hb in-scope. The hb->lock is still locked 999 + * while the reference is dropped. The reference can not be dropped 1000 + * after the unlock because if a user initiated resize is in progress 1001 + * then we might need to wake him. This can not be done after the 1002 + * rt_mutex_pre_schedule() invocation. 
The hb will remain valid because 1003 + * the thread, performing resize, will block on hb->lock during 1004 + * the requeue. 1005 + */ 1006 + futex_hash_put(no_free_ptr(hb)); 1007 + /* 1008 + * Must be done before we enqueue the waiter, here is unfortunately 1009 + * under the hb lock, but that *should* work because it does nothing. 1010 + */ 1011 + rt_mutex_pre_schedule(); 998 1012 999 - rt_mutex_init_waiter(&rt_waiter); 1013 + rt_mutex_init_waiter(&rt_waiter); 1000 1014 1001 - /* 1002 - * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not 1003 - * hold it while doing rt_mutex_start_proxy(), because then it will 1004 - * include hb->lock in the blocking chain, even through we'll not in 1005 - * fact hold it while blocking. This will lead it to report -EDEADLK 1006 - * and BUG when futex_unlock_pi() interleaves with this. 1007 - * 1008 - * Therefore acquire wait_lock while holding hb->lock, but drop the 1009 - * latter before calling __rt_mutex_start_proxy_lock(). This 1010 - * interleaves with futex_unlock_pi() -- which does a similar lock 1011 - * handoff -- such that the latter can observe the futex_q::pi_state 1012 - * before __rt_mutex_start_proxy_lock() is done. 1013 - */ 1014 - raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 1015 - spin_unlock(q.lock_ptr); 1016 - /* 1017 - * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter 1018 - * such that futex_unlock_pi() is guaranteed to observe the waiter when 1019 - * it sees the futex_q::pi_state. 1020 - */ 1021 - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); 1022 - raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); 1015 + /* 1016 + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not 1017 + * hold it while doing rt_mutex_start_proxy(), because then it will 1018 + * include hb->lock in the blocking chain, even through we'll not in 1019 + * fact hold it while blocking. 
This will lead it to report -EDEADLK 1020 + * and BUG when futex_unlock_pi() interleaves with this. 1021 + * 1022 + * Therefore acquire wait_lock while holding hb->lock, but drop the 1023 + * latter before calling __rt_mutex_start_proxy_lock(). This 1024 + * interleaves with futex_unlock_pi() -- which does a similar lock 1025 + * handoff -- such that the latter can observe the futex_q::pi_state 1026 + * before __rt_mutex_start_proxy_lock() is done. 1027 + */ 1028 + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 1029 + spin_unlock(q.lock_ptr); 1030 + /* 1031 + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter 1032 + * such that futex_unlock_pi() is guaranteed to observe the waiter when 1033 + * it sees the futex_q::pi_state. 1034 + */ 1035 + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); 1036 + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); 1023 1037 1024 - if (ret) { 1025 - if (ret == 1) 1026 - ret = 0; 1027 - goto cleanup; 1028 - } 1038 + if (ret) { 1039 + if (ret == 1) 1040 + ret = 0; 1041 + goto cleanup; 1042 + } 1029 1043 1030 - if (unlikely(to)) 1031 - hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 1044 + if (unlikely(to)) 1045 + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 1032 1046 1033 - ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 1047 + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 1034 1048 1035 1049 cleanup: 1036 - /* 1037 - * If we failed to acquire the lock (deadlock/signal/timeout), we must 1038 - * must unwind the above, however we canont lock hb->lock because 1039 - * rt_mutex already has a waiter enqueued and hb->lock can itself try 1040 - * and enqueue an rt_waiter through rtlock. 1041 - * 1042 - * Doing the cleanup without holding hb->lock can cause inconsistent 1043 - * state between hb and pi_state, but only in the direction of not 1044 - * seeing a waiter that is leaving. 
1045 - * 1046 - * See futex_unlock_pi(), it deals with this inconsistency. 1047 - * 1048 - * There be dragons here, since we must deal with the inconsistency on 1049 - * the way out (here), it is impossible to detect/warn about the race 1050 - * the other way around (missing an incoming waiter). 1051 - * 1052 - * What could possibly go wrong... 1053 - */ 1054 - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 1055 - ret = 0; 1050 + /* 1051 + * If we failed to acquire the lock (deadlock/signal/timeout), we must 1052 + * unwind the above, however we canont lock hb->lock because 1053 + * rt_mutex already has a waiter enqueued and hb->lock can itself try 1054 + * and enqueue an rt_waiter through rtlock. 1055 + * 1056 + * Doing the cleanup without holding hb->lock can cause inconsistent 1057 + * state between hb and pi_state, but only in the direction of not 1058 + * seeing a waiter that is leaving. 1059 + * 1060 + * See futex_unlock_pi(), it deals with this inconsistency. 1061 + * 1062 + * There be dragons here, since we must deal with the inconsistency on 1063 + * the way out (here), it is impossible to detect/warn about the race 1064 + * the other way around (missing an incoming waiter). 1065 + * 1066 + * What could possibly go wrong... 1067 + */ 1068 + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 1069 + ret = 0; 1056 1070 1057 - /* 1058 - * Now that the rt_waiter has been dequeued, it is safe to use 1059 - * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up 1060 - * the 1061 - */ 1062 - spin_lock(q.lock_ptr); 1063 - /* 1064 - * Waiter is unqueued. 1065 - */ 1066 - rt_mutex_post_schedule(); 1071 + /* 1072 + * Now that the rt_waiter has been dequeued, it is safe to use 1073 + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up 1074 + * the 1075 + */ 1076 + futex_q_lockptr_lock(&q); 1077 + /* 1078 + * Waiter is unqueued. 
1079 + */ 1080 + rt_mutex_post_schedule(); 1067 1081 no_block: 1068 - /* 1069 - * Fixup the pi_state owner and possibly acquire the lock if we 1070 - * haven't already. 1071 - */ 1072 - res = fixup_pi_owner(uaddr, &q, !ret); 1073 - /* 1074 - * If fixup_pi_owner() returned an error, propagate that. If it acquired 1075 - * the lock, clear our -ETIMEDOUT or -EINTR. 1076 - */ 1077 - if (res) 1078 - ret = (res < 0) ? res : 0; 1082 + /* 1083 + * Fixup the pi_state owner and possibly acquire the lock if we 1084 + * haven't already. 1085 + */ 1086 + res = fixup_pi_owner(uaddr, &q, !ret); 1087 + /* 1088 + * If fixup_pi_owner() returned an error, propagate that. If it acquired 1089 + * the lock, clear our -ETIMEDOUT or -EINTR. 1090 + */ 1091 + if (res) 1092 + ret = (res < 0) ? res : 0; 1079 1093 1080 - futex_unqueue_pi(&q); 1081 - spin_unlock(q.lock_ptr); 1082 - goto out; 1094 + futex_unqueue_pi(&q); 1095 + spin_unlock(q.lock_ptr); 1096 + if (q.drop_hb_ref) { 1097 + CLASS(hb, hb)(&q.key); 1098 + /* Additional reference from futex_unlock_pi() */ 1099 + futex_hash_put(hb); 1100 + } 1101 + goto out; 1083 1102 1084 1103 out_unlock_put_key: 1085 - futex_q_unlock(hb); 1104 + futex_q_unlock(hb); 1105 + goto out; 1106 + 1107 + uaddr_faulted: 1108 + futex_q_unlock(hb); 1109 + 1110 + ret = fault_in_user_writeable(uaddr); 1111 + if (ret) 1112 + goto out; 1113 + 1114 + if (!(flags & FLAGS_SHARED)) 1115 + goto retry_private; 1116 + 1117 + goto retry; 1118 + } 1086 1119 1087 1120 out: 1088 1121 if (to) { ··· 1122 1091 destroy_hrtimer_on_stack(&to->timer); 1123 1092 } 1124 1093 return ret != -EINTR ? 
ret : -ERESTARTNOINTR; 1125 - 1126 - uaddr_faulted: 1127 - futex_q_unlock(hb); 1128 - 1129 - ret = fault_in_user_writeable(uaddr); 1130 - if (ret) 1131 - goto out; 1132 - 1133 - if (!(flags & FLAGS_SHARED)) 1134 - goto retry_private; 1135 - 1136 - goto retry; 1137 1094 } 1138 1095 1139 1096 /* ··· 1133 1114 { 1134 1115 u32 curval, uval, vpid = task_pid_vnr(current); 1135 1116 union futex_key key = FUTEX_KEY_INIT; 1136 - struct futex_hash_bucket *hb; 1137 1117 struct futex_q *top_waiter; 1138 1118 int ret; 1139 1119 ··· 1152 1134 if (ret) 1153 1135 return ret; 1154 1136 1155 - hb = futex_hash(&key); 1137 + CLASS(hb, hb)(&key); 1156 1138 spin_lock(&hb->lock); 1157 1139 retry_hb: 1158 1140 ··· 1205 1187 */ 1206 1188 rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); 1207 1189 if (!rt_waiter) { 1190 + /* 1191 + * Acquire a reference for the leaving waiter to ensure 1192 + * valid futex_q::lock_ptr. 1193 + */ 1194 + futex_hash_get(hb); 1195 + top_waiter->drop_hb_ref = true; 1208 1196 __futex_unqueue(top_waiter); 1209 1197 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1210 1198 goto retry_hb;

+244 -236

kernel/futex/requeue.c

··· 87 87 futex_hb_waiters_inc(hb2); 88 88 plist_add(&q->list, &hb2->chain); 89 89 q->lock_ptr = &hb2->lock; 90 + /* 91 + * hb1 and hb2 belong to the same futex_hash_bucket_private 92 + * because if we managed get a reference on hb1 then it can't be 93 + * replaced. Therefore we avoid put(hb1)+get(hb2) here. 94 + */ 90 95 } 91 96 q->key = *key2; 92 97 } ··· 236 231 237 232 WARN_ON(!q->rt_waiter); 238 233 q->rt_waiter = NULL; 239 - 234 + /* 235 + * Acquire a reference for the waiter to ensure valid 236 + * futex_q::lock_ptr. 237 + */ 238 + futex_hash_get(hb); 239 + q->drop_hb_ref = true; 240 240 q->lock_ptr = &hb->lock; 241 241 242 242 /* Signal locked state to the waiter */ ··· 381 371 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 382 372 int task_count = 0, ret; 383 373 struct futex_pi_state *pi_state = NULL; 384 - struct futex_hash_bucket *hb1, *hb2; 385 374 struct futex_q *this, *next; 386 375 DEFINE_WAKE_Q(wake_q); 387 376 ··· 452 443 if (requeue_pi && futex_match(&key1, &key2)) 453 444 return -EINVAL; 454 445 455 - hb1 = futex_hash(&key1); 456 - hb2 = futex_hash(&key2); 457 - 458 446 retry_private: 459 - futex_hb_waiters_inc(hb2); 460 - double_lock_hb(hb1, hb2); 447 + if (1) { 448 + CLASS(hb, hb1)(&key1); 449 + CLASS(hb, hb2)(&key2); 461 450 462 - if (likely(cmpval != NULL)) { 463 - u32 curval; 451 + futex_hb_waiters_inc(hb2); 452 + double_lock_hb(hb1, hb2); 464 453 465 - ret = futex_get_value_locked(&curval, uaddr1); 454 + if (likely(cmpval != NULL)) { 455 + u32 curval; 466 456 467 - if (unlikely(ret)) { 468 - double_unlock_hb(hb1, hb2); 469 - futex_hb_waiters_dec(hb2); 457 + ret = futex_get_value_locked(&curval, uaddr1); 470 458 471 - ret = get_user(curval, uaddr1); 472 - if (ret) 473 - return ret; 459 + if (unlikely(ret)) { 460 + futex_hb_waiters_dec(hb2); 461 + double_unlock_hb(hb1, hb2); 474 462 475 - if (!(flags1 & FLAGS_SHARED)) 476 - goto retry_private; 463 + ret = get_user(curval, uaddr1); 464 + if (ret) 465 + return ret; 477 466 478 - 
goto retry; 479 - } 480 - if (curval != *cmpval) { 481 - ret = -EAGAIN; 482 - goto out_unlock; 483 - } 484 - } 467 + if (!(flags1 & FLAGS_SHARED)) 468 + goto retry_private; 485 469 486 - if (requeue_pi) { 487 - struct task_struct *exiting = NULL; 488 - 489 - /* 490 - * Attempt to acquire uaddr2 and wake the top waiter. If we 491 - * intend to requeue waiters, force setting the FUTEX_WAITERS 492 - * bit. We force this here where we are able to easily handle 493 - * faults rather in the requeue loop below. 494 - * 495 - * Updates topwaiter::requeue_state if a top waiter exists. 496 - */ 497 - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 498 - &key2, &pi_state, 499 - &exiting, nr_requeue); 500 - 501 - /* 502 - * At this point the top_waiter has either taken uaddr2 or 503 - * is waiting on it. In both cases pi_state has been 504 - * established and an initial refcount on it. In case of an 505 - * error there's nothing. 506 - * 507 - * The top waiter's requeue_state is up to date: 508 - * 509 - * - If the lock was acquired atomically (ret == 1), then 510 - * the state is Q_REQUEUE_PI_LOCKED. 511 - * 512 - * The top waiter has been dequeued and woken up and can 513 - * return to user space immediately. The kernel/user 514 - * space state is consistent. In case that there must be 515 - * more waiters requeued the WAITERS bit in the user 516 - * space futex is set so the top waiter task has to go 517 - * into the syscall slowpath to unlock the futex. This 518 - * will block until this requeue operation has been 519 - * completed and the hash bucket locks have been 520 - * dropped. 521 - * 522 - * - If the trylock failed with an error (ret < 0) then 523 - * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing 524 - * happened", or Q_REQUEUE_PI_IGNORE when there was an 525 - * interleaved early wakeup. 
526 - * 527 - * - If the trylock did not succeed (ret == 0) then the 528 - * state is either Q_REQUEUE_PI_IN_PROGRESS or 529 - * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. 530 - * This will be cleaned up in the loop below, which 531 - * cannot fail because futex_proxy_trylock_atomic() did 532 - * the same sanity checks for requeue_pi as the loop 533 - * below does. 534 - */ 535 - switch (ret) { 536 - case 0: 537 - /* We hold a reference on the pi state. */ 538 - break; 539 - 540 - case 1: 541 - /* 542 - * futex_proxy_trylock_atomic() acquired the user space 543 - * futex. Adjust task_count. 544 - */ 545 - task_count++; 546 - ret = 0; 547 - break; 548 - 549 - /* 550 - * If the above failed, then pi_state is NULL and 551 - * waiter::requeue_state is correct. 552 - */ 553 - case -EFAULT: 554 - double_unlock_hb(hb1, hb2); 555 - futex_hb_waiters_dec(hb2); 556 - ret = fault_in_user_writeable(uaddr2); 557 - if (!ret) 558 470 goto retry; 559 - return ret; 560 - case -EBUSY: 561 - case -EAGAIN: 562 - /* 563 - * Two reasons for this: 564 - * - EBUSY: Owner is exiting and we just wait for the 565 - * exit to complete. 566 - * - EAGAIN: The user space value changed. 567 - */ 568 - double_unlock_hb(hb1, hb2); 569 - futex_hb_waiters_dec(hb2); 570 - /* 571 - * Handle the case where the owner is in the middle of 572 - * exiting. Wait for the exit to complete otherwise 573 - * this task might loop forever, aka. live lock. 574 - */ 575 - wait_for_owner_exiting(ret, exiting); 576 - cond_resched(); 577 - goto retry; 578 - default: 579 - goto out_unlock; 580 - } 581 - } 582 - 583 - plist_for_each_entry_safe(this, next, &hb1->chain, list) { 584 - if (task_count - nr_wake >= nr_requeue) 585 - break; 586 - 587 - if (!futex_match(&this->key, &key1)) 588 - continue; 589 - 590 - /* 591 - * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always 592 - * be paired with each other and no other futex ops. 
593 - * 594 - * We should never be requeueing a futex_q with a pi_state, 595 - * which is awaiting a futex_unlock_pi(). 596 - */ 597 - if ((requeue_pi && !this->rt_waiter) || 598 - (!requeue_pi && this->rt_waiter) || 599 - this->pi_state) { 600 - ret = -EINVAL; 601 - break; 471 + } 472 + if (curval != *cmpval) { 473 + ret = -EAGAIN; 474 + goto out_unlock; 475 + } 602 476 } 603 477 604 - /* Plain futexes just wake or requeue and are done */ 605 - if (!requeue_pi) { 606 - if (++task_count <= nr_wake) 607 - this->wake(&wake_q, this); 608 - else 478 + if (requeue_pi) { 479 + struct task_struct *exiting = NULL; 480 + 481 + /* 482 + * Attempt to acquire uaddr2 and wake the top waiter. If we 483 + * intend to requeue waiters, force setting the FUTEX_WAITERS 484 + * bit. We force this here where we are able to easily handle 485 + * faults rather in the requeue loop below. 486 + * 487 + * Updates topwaiter::requeue_state if a top waiter exists. 488 + */ 489 + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, 490 + &key2, &pi_state, 491 + &exiting, nr_requeue); 492 + 493 + /* 494 + * At this point the top_waiter has either taken uaddr2 or 495 + * is waiting on it. In both cases pi_state has been 496 + * established and an initial refcount on it. In case of an 497 + * error there's nothing. 498 + * 499 + * The top waiter's requeue_state is up to date: 500 + * 501 + * - If the lock was acquired atomically (ret == 1), then 502 + * the state is Q_REQUEUE_PI_LOCKED. 503 + * 504 + * The top waiter has been dequeued and woken up and can 505 + * return to user space immediately. The kernel/user 506 + * space state is consistent. In case that there must be 507 + * more waiters requeued the WAITERS bit in the user 508 + * space futex is set so the top waiter task has to go 509 + * into the syscall slowpath to unlock the futex. This 510 + * will block until this requeue operation has been 511 + * completed and the hash bucket locks have been 512 + * dropped. 
513 + * 514 + * - If the trylock failed with an error (ret < 0) then 515 + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing 516 + * happened", or Q_REQUEUE_PI_IGNORE when there was an 517 + * interleaved early wakeup. 518 + * 519 + * - If the trylock did not succeed (ret == 0) then the 520 + * state is either Q_REQUEUE_PI_IN_PROGRESS or 521 + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. 522 + * This will be cleaned up in the loop below, which 523 + * cannot fail because futex_proxy_trylock_atomic() did 524 + * the same sanity checks for requeue_pi as the loop 525 + * below does. 526 + */ 527 + switch (ret) { 528 + case 0: 529 + /* We hold a reference on the pi state. */ 530 + break; 531 + 532 + case 1: 533 + /* 534 + * futex_proxy_trylock_atomic() acquired the user space 535 + * futex. Adjust task_count. 536 + */ 537 + task_count++; 538 + ret = 0; 539 + break; 540 + 541 + /* 542 + * If the above failed, then pi_state is NULL and 543 + * waiter::requeue_state is correct. 544 + */ 545 + case -EFAULT: 546 + futex_hb_waiters_dec(hb2); 547 + double_unlock_hb(hb1, hb2); 548 + ret = fault_in_user_writeable(uaddr2); 549 + if (!ret) 550 + goto retry; 551 + return ret; 552 + case -EBUSY: 553 + case -EAGAIN: 554 + /* 555 + * Two reasons for this: 556 + * - EBUSY: Owner is exiting and we just wait for the 557 + * exit to complete. 558 + * - EAGAIN: The user space value changed. 559 + */ 560 + futex_hb_waiters_dec(hb2); 561 + double_unlock_hb(hb1, hb2); 562 + /* 563 + * Handle the case where the owner is in the middle of 564 + * exiting. Wait for the exit to complete otherwise 565 + * this task might loop forever, aka. live lock. 
566 + */ 567 + wait_for_owner_exiting(ret, exiting); 568 + cond_resched(); 569 + goto retry; 570 + default: 571 + goto out_unlock; 572 + } 573 + } 574 + 575 + plist_for_each_entry_safe(this, next, &hb1->chain, list) { 576 + if (task_count - nr_wake >= nr_requeue) 577 + break; 578 + 579 + if (!futex_match(&this->key, &key1)) 580 + continue; 581 + 582 + /* 583 + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always 584 + * be paired with each other and no other futex ops. 585 + * 586 + * We should never be requeueing a futex_q with a pi_state, 587 + * which is awaiting a futex_unlock_pi(). 588 + */ 589 + if ((requeue_pi && !this->rt_waiter) || 590 + (!requeue_pi && this->rt_waiter) || 591 + this->pi_state) { 592 + ret = -EINVAL; 593 + break; 594 + } 595 + 596 + /* Plain futexes just wake or requeue and are done */ 597 + if (!requeue_pi) { 598 + if (++task_count <= nr_wake) 599 + this->wake(&wake_q, this); 600 + else 601 + requeue_futex(this, hb1, hb2, &key2); 602 + continue; 603 + } 604 + 605 + /* Ensure we requeue to the expected futex for requeue_pi. */ 606 + if (!futex_match(this->requeue_pi_key, &key2)) { 607 + ret = -EINVAL; 608 + break; 609 + } 610 + 611 + /* 612 + * Requeue nr_requeue waiters and possibly one more in the case 613 + * of requeue_pi if we couldn't acquire the lock atomically. 614 + * 615 + * Prepare the waiter to take the rt_mutex. Take a refcount 616 + * on the pi_state and store the pointer in the futex_q 617 + * object of the waiter. 618 + */ 619 + get_pi_state(pi_state); 620 + 621 + /* Don't requeue when the waiter is already on the way out. */ 622 + if (!futex_requeue_pi_prepare(this, pi_state)) { 623 + /* 624 + * Early woken waiter signaled that it is on the 625 + * way out. Drop the pi_state reference and try the 626 + * next waiter. @this->pi_state is still NULL. 
627 + */ 628 + put_pi_state(pi_state); 629 + continue; 630 + } 631 + 632 + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 633 + this->rt_waiter, 634 + this->task); 635 + 636 + if (ret == 1) { 637 + /* 638 + * We got the lock. We do neither drop the refcount 639 + * on pi_state nor clear this->pi_state because the 640 + * waiter needs the pi_state for cleaning up the 641 + * user space value. It will drop the refcount 642 + * after doing so. this::requeue_state is updated 643 + * in the wakeup as well. 644 + */ 645 + requeue_pi_wake_futex(this, &key2, hb2); 646 + task_count++; 647 + } else if (!ret) { 648 + /* Waiter is queued, move it to hb2 */ 609 649 requeue_futex(this, hb1, hb2, &key2); 610 - continue; 611 - } 612 - 613 - /* Ensure we requeue to the expected futex for requeue_pi. */ 614 - if (!futex_match(this->requeue_pi_key, &key2)) { 615 - ret = -EINVAL; 616 - break; 650 + futex_requeue_pi_complete(this, 0); 651 + task_count++; 652 + } else { 653 + /* 654 + * rt_mutex_start_proxy_lock() detected a potential 655 + * deadlock when we tried to queue that waiter. 656 + * Drop the pi_state reference which we took above 657 + * and remove the pointer to the state from the 658 + * waiters futex_q object. 659 + */ 660 + this->pi_state = NULL; 661 + put_pi_state(pi_state); 662 + futex_requeue_pi_complete(this, ret); 663 + /* 664 + * We stop queueing more waiters and let user space 665 + * deal with the mess. 666 + */ 667 + break; 668 + } 617 669 } 618 670 619 671 /* 620 - * Requeue nr_requeue waiters and possibly one more in the case 621 - * of requeue_pi if we couldn't acquire the lock atomically. 622 - * 623 - * Prepare the waiter to take the rt_mutex. Take a refcount 624 - * on the pi_state and store the pointer in the futex_q 625 - * object of the waiter. 672 + * We took an extra initial reference to the pi_state in 673 + * futex_proxy_trylock_atomic(). We need to drop it here again. 
626 674 */ 627 - get_pi_state(pi_state); 628 - 629 - /* Don't requeue when the waiter is already on the way out. */ 630 - if (!futex_requeue_pi_prepare(this, pi_state)) { 631 - /* 632 - * Early woken waiter signaled that it is on the 633 - * way out. Drop the pi_state reference and try the 634 - * next waiter. @this->pi_state is still NULL. 635 - */ 636 - put_pi_state(pi_state); 637 - continue; 638 - } 639 - 640 - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 641 - this->rt_waiter, 642 - this->task); 643 - 644 - if (ret == 1) { 645 - /* 646 - * We got the lock. We do neither drop the refcount 647 - * on pi_state nor clear this->pi_state because the 648 - * waiter needs the pi_state for cleaning up the 649 - * user space value. It will drop the refcount 650 - * after doing so. this::requeue_state is updated 651 - * in the wakeup as well. 652 - */ 653 - requeue_pi_wake_futex(this, &key2, hb2); 654 - task_count++; 655 - } else if (!ret) { 656 - /* Waiter is queued, move it to hb2 */ 657 - requeue_futex(this, hb1, hb2, &key2); 658 - futex_requeue_pi_complete(this, 0); 659 - task_count++; 660 - } else { 661 - /* 662 - * rt_mutex_start_proxy_lock() detected a potential 663 - * deadlock when we tried to queue that waiter. 664 - * Drop the pi_state reference which we took above 665 - * and remove the pointer to the state from the 666 - * waiters futex_q object. 667 - */ 668 - this->pi_state = NULL; 669 - put_pi_state(pi_state); 670 - futex_requeue_pi_complete(this, ret); 671 - /* 672 - * We stop queueing more waiters and let user space 673 - * deal with the mess. 674 - */ 675 - break; 676 - } 677 - } 678 - 679 - /* 680 - * We took an extra initial reference to the pi_state in 681 - * futex_proxy_trylock_atomic(). We need to drop it here again. 
682 - */ 683 - put_pi_state(pi_state); 675 + put_pi_state(pi_state); 684 676 685 677 out_unlock: 686 - double_unlock_hb(hb1, hb2); 678 + futex_hb_waiters_dec(hb2); 679 + double_unlock_hb(hb1, hb2); 680 + } 687 681 wake_up_q(&wake_q); 688 - futex_hb_waiters_dec(hb2); 689 682 return ret ? ret : task_count; 690 683 } 691 684 ··· 780 769 { 781 770 struct hrtimer_sleeper timeout, *to; 782 771 struct rt_mutex_waiter rt_waiter; 783 - struct futex_hash_bucket *hb; 784 772 union futex_key key2 = FUTEX_KEY_INIT; 785 773 struct futex_q q = futex_q_init; 786 774 struct rt_mutex_base *pi_mutex; ··· 815 805 * Prepare to wait on uaddr. On success, it holds hb->lock and q 816 806 * is initialized. 817 807 */ 818 - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 808 + ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current); 819 809 if (ret) 820 810 goto out; 821 811 822 - /* 823 - * The check above which compares uaddrs is not sufficient for 824 - * shared futexes. We need to compare the keys: 825 - */ 826 - if (futex_match(&q.key, &key2)) { 827 - futex_q_unlock(hb); 828 - ret = -EINVAL; 829 - goto out; 830 - } 831 - 832 812 /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ 833 - futex_wait_queue(hb, &q, to); 813 + futex_do_wait(&q, to); 834 814 835 815 switch (futex_requeue_pi_wakeup_sync(&q)) { 836 816 case Q_REQUEUE_PI_IGNORE: 837 - /* The waiter is still on uaddr1 */ 838 - spin_lock(&hb->lock); 839 - ret = handle_early_requeue_pi_wakeup(hb, &q, to); 840 - spin_unlock(&hb->lock); 817 + { 818 + CLASS(hb, hb)(&q.key); 819 + /* The waiter is still on uaddr1 */ 820 + spin_lock(&hb->lock); 821 + ret = handle_early_requeue_pi_wakeup(hb, &q, to); 822 + spin_unlock(&hb->lock); 823 + } 841 824 break; 842 825 843 826 case Q_REQUEUE_PI_LOCKED: 844 827 /* The requeue acquired the lock */ 845 828 if (q.pi_state && (q.pi_state->owner != current)) { 846 - spin_lock(q.lock_ptr); 829 + futex_q_lockptr_lock(&q); 847 830 ret = fixup_pi_owner(uaddr2, &q, true); 848 831 /* 849 832 * Drop the reference to the pi state which the ··· 863 860 if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) 864 861 ret = 0; 865 862 866 - spin_lock(q.lock_ptr); 863 + futex_q_lockptr_lock(&q); 867 864 debug_rt_mutex_free_waiter(&rt_waiter); 868 865 /* 869 866 * Fixup the pi_state owner and possibly acquire the lock if we ··· 894 891 break; 895 892 default: 896 893 BUG(); 894 + } 895 + if (q.drop_hb_ref) { 896 + CLASS(hb, hb)(&q.key); 897 + /* Additional reference from requeue_pi_wake_futex() */ 898 + futex_hash_put(hb); 897 899 } 898 900 899 901 out:

+113 -94

kernel/futex/waitwake.c

··· 154 154 */ 155 155 int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) 156 156 { 157 - struct futex_hash_bucket *hb; 158 157 struct futex_q *this, *next; 159 158 union futex_key key = FUTEX_KEY_INIT; 160 159 DEFINE_WAKE_Q(wake_q); ··· 169 170 if ((flags & FLAGS_STRICT) && !nr_wake) 170 171 return 0; 171 172 172 - hb = futex_hash(&key); 173 + CLASS(hb, hb)(&key); 173 174 174 175 /* Make sure we really have tasks to wakeup */ 175 176 if (!futex_hb_waiters_pending(hb)) ··· 252 253 int nr_wake, int nr_wake2, int op) 253 254 { 254 255 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 255 - struct futex_hash_bucket *hb1, *hb2; 256 256 struct futex_q *this, *next; 257 257 int ret, op_ret; 258 258 DEFINE_WAKE_Q(wake_q); ··· 264 266 if (unlikely(ret != 0)) 265 267 return ret; 266 268 267 - hb1 = futex_hash(&key1); 268 - hb2 = futex_hash(&key2); 269 - 270 269 retry_private: 271 - double_lock_hb(hb1, hb2); 272 - op_ret = futex_atomic_op_inuser(op, uaddr2); 273 - if (unlikely(op_ret < 0)) { 274 - double_unlock_hb(hb1, hb2); 270 + if (1) { 271 + CLASS(hb, hb1)(&key1); 272 + CLASS(hb, hb2)(&key2); 275 273 276 - if (!IS_ENABLED(CONFIG_MMU) || 277 - unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { 278 - /* 279 - * we don't get EFAULT from MMU faults if we don't have 280 - * an MMU, but we might get them from range checking 281 - */ 282 - ret = op_ret; 283 - return ret; 284 - } 274 + double_lock_hb(hb1, hb2); 275 + op_ret = futex_atomic_op_inuser(op, uaddr2); 276 + if (unlikely(op_ret < 0)) { 277 + double_unlock_hb(hb1, hb2); 285 278 286 - if (op_ret == -EFAULT) { 287 - ret = fault_in_user_writeable(uaddr2); 288 - if (ret) 279 + if (!IS_ENABLED(CONFIG_MMU) || 280 + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { 281 + /* 282 + * we don't get EFAULT from MMU faults if we don't have 283 + * an MMU, but we might get them from range checking 284 + */ 285 + ret = op_ret; 289 286 return ret; 290 - } 291 - 292 - cond_resched(); 293 - if 
(!(flags & FLAGS_SHARED)) 294 - goto retry_private; 295 - goto retry; 296 - } 297 - 298 - plist_for_each_entry_safe(this, next, &hb1->chain, list) { 299 - if (futex_match (&this->key, &key1)) { 300 - if (this->pi_state || this->rt_waiter) { 301 - ret = -EINVAL; 302 - goto out_unlock; 303 287 } 304 - this->wake(&wake_q, this); 305 - if (++ret >= nr_wake) 306 - break; 307 - } 308 - } 309 288 310 - if (op_ret > 0) { 311 - op_ret = 0; 312 - plist_for_each_entry_safe(this, next, &hb2->chain, list) { 313 - if (futex_match (&this->key, &key2)) { 289 + if (op_ret == -EFAULT) { 290 + ret = fault_in_user_writeable(uaddr2); 291 + if (ret) 292 + return ret; 293 + } 294 + 295 + cond_resched(); 296 + if (!(flags & FLAGS_SHARED)) 297 + goto retry_private; 298 + goto retry; 299 + } 300 + 301 + plist_for_each_entry_safe(this, next, &hb1->chain, list) { 302 + if (futex_match(&this->key, &key1)) { 314 303 if (this->pi_state || this->rt_waiter) { 315 304 ret = -EINVAL; 316 305 goto out_unlock; 317 306 } 318 307 this->wake(&wake_q, this); 319 - if (++op_ret >= nr_wake2) 308 + if (++ret >= nr_wake) 320 309 break; 321 310 } 322 311 } 323 - ret += op_ret; 324 - } 312 + 313 + if (op_ret > 0) { 314 + op_ret = 0; 315 + plist_for_each_entry_safe(this, next, &hb2->chain, list) { 316 + if (futex_match(&this->key, &key2)) { 317 + if (this->pi_state || this->rt_waiter) { 318 + ret = -EINVAL; 319 + goto out_unlock; 320 + } 321 + this->wake(&wake_q, this); 322 + if (++op_ret >= nr_wake2) 323 + break; 324 + } 325 + } 326 + ret += op_ret; 327 + } 325 328 326 329 out_unlock: 327 - double_unlock_hb(hb1, hb2); 330 + double_unlock_hb(hb1, hb2); 331 + } 328 332 wake_up_q(&wake_q); 329 333 return ret; 330 334 } ··· 334 334 static long futex_wait_restart(struct restart_block *restart); 335 335 336 336 /** 337 - * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal 338 - * @hb: the futex hash bucket, must be locked by the caller 337 + * futex_do_wait() - wait for wakeup, timeout, or 
signal 339 338 * @q: the futex_q to queue up on 340 339 * @timeout: the prepared hrtimer_sleeper, or null for no timeout 341 340 */ 342 - void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, 343 - struct hrtimer_sleeper *timeout) 341 + void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) 344 342 { 345 - /* 346 - * The task state is guaranteed to be set before another task can 347 - * wake it. set_current_state() is implemented using smp_store_mb() and 348 - * futex_queue() calls spin_unlock() upon completion, both serializing 349 - * access to the hash list and forcing another memory barrier. 350 - */ 351 - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 352 - futex_queue(q, hb, current); 353 - 354 343 /* Arm the timer */ 355 344 if (timeout) 356 345 hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); ··· 401 412 */ 402 413 int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) 403 414 { 404 - struct futex_hash_bucket *hb; 405 415 bool retry = false; 406 416 int ret, i; 407 417 u32 uval; 418 + 419 + /* 420 + * Make sure to have a reference on the private_hash such that we 421 + * don't block on rehash after changing the task state below. 422 + */ 423 + guard(private_hash)(); 408 424 409 425 /* 410 426 * Enqueuing multiple futexes is tricky, because we need to enqueue ··· 445 451 struct futex_q *q = &vs[i].q; 446 452 u32 val = vs[i].w.val; 447 453 448 - hb = futex_q_lock(q); 449 - ret = futex_get_value_locked(&uval, uaddr); 454 + if (1) { 455 + CLASS(hb, hb)(&q->key); 450 456 451 - if (!ret && uval == val) { 452 - /* 453 - * The bucket lock can't be held while dealing with the 454 - * next futex. Queue each futex at this moment so hb can 455 - * be unlocked. 
456 - */ 457 - futex_queue(q, hb, current); 458 - continue; 457 + futex_q_lock(q, hb); 458 + ret = futex_get_value_locked(&uval, uaddr); 459 + 460 + if (!ret && uval == val) { 461 + /* 462 + * The bucket lock can't be held while dealing with the 463 + * next futex. Queue each futex at this moment so hb can 464 + * be unlocked. 465 + */ 466 + futex_queue(q, hb, current); 467 + continue; 468 + } 469 + 470 + futex_q_unlock(hb); 459 471 } 460 - 461 - futex_q_unlock(hb); 462 472 __set_current_state(TASK_RUNNING); 463 473 464 474 /* ··· 576 578 * @val: the expected value 577 579 * @flags: futex flags (FLAGS_SHARED, etc.) 578 580 * @q: the associated futex_q 579 - * @hb: storage for hash_bucket pointer to be returned to caller 581 + * @key2: the second futex_key if used for requeue PI 582 + * @task: Task queueing this futex 580 583 * 581 584 * Setup the futex_q and locate the hash_bucket. Get the futex value and 582 585 * compare it with the expected value. Handle atomic faults internally. ··· 585 586 * 586 587 * Return: 587 588 * - 0 - uaddr contains val and hb has been locked; 588 - * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 589 + * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not 590 + * be read, does not contain the expected value or is not properly aligned. 
589 591 */ 590 592 int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 591 - struct futex_q *q, struct futex_hash_bucket **hb) 593 + struct futex_q *q, union futex_key *key2, 594 + struct task_struct *task) 592 595 { 593 596 u32 uval; 594 597 int ret; ··· 619 618 return ret; 620 619 621 620 retry_private: 622 - *hb = futex_q_lock(q); 621 + if (1) { 622 + CLASS(hb, hb)(&q->key); 623 623 624 - ret = futex_get_value_locked(&uval, uaddr); 624 + futex_q_lock(q, hb); 625 625 626 - if (ret) { 627 - futex_q_unlock(*hb); 626 + ret = futex_get_value_locked(&uval, uaddr); 628 627 629 - ret = get_user(uval, uaddr); 630 - if (ret) 631 - return ret; 628 + if (ret) { 629 + futex_q_unlock(hb); 632 630 633 - if (!(flags & FLAGS_SHARED)) 634 - goto retry_private; 631 + ret = get_user(uval, uaddr); 632 + if (ret) 633 + return ret; 635 634 636 - goto retry; 637 - } 635 + if (!(flags & FLAGS_SHARED)) 636 + goto retry_private; 638 637 639 - if (uval != val) { 640 - futex_q_unlock(*hb); 641 - ret = -EWOULDBLOCK; 638 + goto retry; 639 + } 640 + 641 + if (uval != val) { 642 + futex_q_unlock(hb); 643 + return -EWOULDBLOCK; 644 + } 645 + 646 + if (key2 && futex_match(&q->key, key2)) { 647 + futex_q_unlock(hb); 648 + return -EINVAL; 649 + } 650 + 651 + /* 652 + * The task state is guaranteed to be set before another task can 653 + * wake it. set_current_state() is implemented using smp_store_mb() and 654 + * futex_queue() calls spin_unlock() upon completion, both serializing 655 + * access to the hash list and forcing another memory barrier. 656 + */ 657 + if (task == current) 658 + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 659 + futex_queue(q, hb, task); 642 660 } 643 661 644 662 return ret; ··· 667 647 struct hrtimer_sleeper *to, u32 bitset) 668 648 { 669 649 struct futex_q q = futex_q_init; 670 - struct futex_hash_bucket *hb; 671 650 int ret; 672 651 673 652 if (!bitset) ··· 679 660 * Prepare to wait on uaddr. 
On success, it holds hb->lock and q 680 661 * is initialized. 681 662 */ 682 - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); 663 + ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); 683 664 if (ret) 684 665 return ret; 685 666 686 667 /* futex_queue and wait for wakeup, timeout, or a signal. */ 687 - futex_wait_queue(hb, &q, to); 668 + futex_do_wait(&q, to); 688 669 689 670 /* If we were woken (and unqueued), we succeeded, whatever. */ 690 671 if (!futex_unqueue(&q))

+41 -35

kernel/locking/lockdep.c

··· 219 219 static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; 220 220 unsigned long nr_lock_classes; 221 221 unsigned long nr_zapped_classes; 222 + unsigned long nr_dynamic_keys; 222 223 unsigned long max_lock_class_idx; 223 224 struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 224 225 DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); ··· 1239 1238 goto out_unlock; 1240 1239 } 1241 1240 hlist_add_head_rcu(&key->hash_entry, hash_head); 1241 + nr_dynamic_keys++; 1242 1242 out_unlock: 1243 1243 graph_unlock(); 1244 1244 restore_irqs: ··· 1976 1974 pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); 1977 1975 1978 1976 print_circular_bug_entry(entry, depth); 1979 - } 1980 - 1981 - /* 1982 - * We are about to add A -> B into the dependency graph, and in __bfs() a 1983 - * strong dependency path A -> .. -> B is found: hlock_class equals 1984 - * entry->class. 1985 - * 1986 - * If A -> .. -> B can replace A -> B in any __bfs() search (means the former 1987 - * is _stronger_ than or equal to the latter), we consider A -> B as redundant. 1988 - * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A 1989 - * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the 1990 - * dependency graph, as any strong path ..-> A -> B ->.. we can get with 1991 - * having dependency A -> B, we could already get a equivalent path ..-> A -> 1992 - * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. 1993 - * 1994 - * We need to make sure both the start and the end of A -> .. -> B is not 1995 - * weaker than A -> B. For the start part, please see the comment in 1996 - * check_redundant(). For the end part, we need: 1997 - * 1998 - * Either 1999 - * 2000 - * a) A -> B is -(*R)-> (everything is not weaker than that) 2001 - * 2002 - * or 2003 - * 2004 - * b) A -> .. 
-> B is -(*N)-> (nothing is stronger than this) 2005 - * 2006 - */ 2007 - static inline bool hlock_equal(struct lock_list *entry, void *data) 2008 - { 2009 - struct held_lock *hlock = (struct held_lock *)data; 2010 - 2011 - return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ 2012 - (hlock->read == 2 || /* A -> B is -(*R)-> */ 2013 - !entry->only_xr); /* A -> .. -> B is -(*N)-> */ 2014 1977 } 2015 1978 2016 1979 /* ··· 2882 2915 #endif /* CONFIG_TRACE_IRQFLAGS */ 2883 2916 2884 2917 #ifdef CONFIG_LOCKDEP_SMALL 2918 + /* 2919 + * We are about to add A -> B into the dependency graph, and in __bfs() a 2920 + * strong dependency path A -> .. -> B is found: hlock_class equals 2921 + * entry->class. 2922 + * 2923 + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former 2924 + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. 2925 + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A 2926 + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the 2927 + * dependency graph, as any strong path ..-> A -> B ->.. we can get with 2928 + * having dependency A -> B, we could already get a equivalent path ..-> A -> 2929 + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. 2930 + * 2931 + * We need to make sure both the start and the end of A -> .. -> B is not 2932 + * weaker than A -> B. For the start part, please see the comment in 2933 + * check_redundant(). For the end part, we need: 2934 + * 2935 + * Either 2936 + * 2937 + * a) A -> B is -(*R)-> (everything is not weaker than that) 2938 + * 2939 + * or 2940 + * 2941 + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) 2942 + * 2943 + */ 2944 + static inline bool hlock_equal(struct lock_list *entry, void *data) 2945 + { 2946 + struct held_lock *hlock = (struct held_lock *)data; 2947 + 2948 + return hlock_class(hlock) == entry->class && /* Found A -> .. 
-> B */ 2949 + (hlock->read == 2 || /* A -> B is -(*R)-> */ 2950 + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ 2951 + } 2952 + 2885 2953 /* 2886 2954 * Check that the dependency graph starting at <src> can lead to 2887 2955 * <target> or not. If it can, <src> -> <target> dependency is already ··· 5103 5101 lockevent_inc(lockdep_nocheck); 5104 5102 } 5105 5103 5104 + if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES)) 5105 + return 0; 5106 + 5106 5107 if (subclass < NR_LOCKDEP_CACHING_CLASSES) 5107 5108 class = lock->class_cache[subclass]; 5108 5109 /* ··· 6611 6606 pf = get_pending_free(); 6612 6607 __lockdep_free_key_range(pf, key, 1); 6613 6608 need_callback = prepare_call_rcu_zapped(pf); 6609 + nr_dynamic_keys--; 6614 6610 } 6615 6611 lockdep_unlock(); 6616 6612 raw_local_irq_restore(flags);

+1

kernel/locking/lockdep_internals.h

··· 138 138 extern unsigned long nr_zapped_classes; 139 139 extern unsigned long nr_zapped_lock_chains; 140 140 extern unsigned long nr_list_entries; 141 + extern unsigned long nr_dynamic_keys; 141 142 long lockdep_next_lockchain(long i); 142 143 unsigned long lock_chain_count(void); 143 144 extern unsigned long nr_stack_trace_entries;

+2

kernel/locking/lockdep_proc.c

··· 286 286 #endif 287 287 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 288 288 nr_lock_classes, MAX_LOCKDEP_KEYS); 289 + seq_printf(m, " dynamic-keys: %11lu\n", 290 + nr_dynamic_keys); 289 291 seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", 290 292 nr_list_entries, MAX_LOCKDEP_ENTRIES); 291 293 seq_printf(m, " indirect dependencies: %11lu\n",

+4

kernel/sys.c

··· 52 52 #include <linux/user_namespace.h> 53 53 #include <linux/time_namespace.h> 54 54 #include <linux/binfmts.h> 55 + #include <linux/futex.h> 55 56 56 57 #include <linux/sched.h> 57 58 #include <linux/sched/autogroup.h> ··· 2820 2819 if (arg3 || arg4 || arg5) 2821 2820 return -EINVAL; 2822 2821 error = posixtimer_create_prctl(arg2); 2822 + break; 2823 + case PR_FUTEX_HASH: 2824 + error = futex_hash_prctl(arg2, arg3, arg4); 2823 2825 break; 2824 2826 default: 2825 2827 trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);

+17 -1

mm/nommu.c

··· 200 200 } 201 201 EXPORT_SYMBOL(vmalloc_noprof); 202 202 203 - void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof); 203 + /* 204 + * vmalloc_huge_node - allocate virtually contiguous memory, on a node 205 + * 206 + * @size: allocation size 207 + * @gfp_mask: flags for the page level allocator 208 + * @node: node to use for allocation or NUMA_NO_NODE 209 + * 210 + * Allocate enough pages to cover @size from the page level 211 + * allocator and map them into contiguous kernel virtual space. 212 + * 213 + * Due to NOMMU implications the node argument and HUGE page attribute is 214 + * ignored. 215 + */ 216 + void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) 217 + { 218 + return __vmalloc_noprof(size, gfp_mask); 219 + } 204 220 205 221 /* 206 222 * vzalloc - allocate virtually contiguous memory with zero fill

+6 -5

mm/vmalloc.c

··· 3944 3944 EXPORT_SYMBOL(vmalloc_noprof); 3945 3945 3946 3946 /** 3947 - * vmalloc_huge - allocate virtually contiguous memory, allow huge pages 3947 + * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages 3948 3948 * @size: allocation size 3949 3949 * @gfp_mask: flags for the page level allocator 3950 + * @node: node to use for allocation or NUMA_NO_NODE 3950 3951 * 3951 3952 * Allocate enough pages to cover @size from the page level 3952 3953 * allocator and map them into contiguous kernel virtual space. ··· 3956 3955 * 3957 3956 * Return: pointer to the allocated memory or %NULL on error 3958 3957 */ 3959 - void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) 3958 + void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) 3960 3959 { 3961 3960 return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, 3962 - gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, 3963 - NUMA_NO_NODE, __builtin_return_address(0)); 3961 + gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, 3962 + node, __builtin_return_address(0)); 3964 3963 } 3965 - EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); 3964 + EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof); 3966 3965 3967 3966 /** 3968 3967 * vzalloc - allocate virtually contiguous memory with zero fill

+44 -1

tools/include/uapi/linux/prctl.h

··· 230 230 # define PR_PAC_APDBKEY (1UL << 3) 231 231 # define PR_PAC_APGAKEY (1UL << 4) 232 232 233 - /* Tagged user address controls for arm64 */ 233 + /* Tagged user address controls for arm64 and RISC-V */ 234 234 #define PR_SET_TAGGED_ADDR_CTRL 55 235 235 #define PR_GET_TAGGED_ADDR_CTRL 56 236 236 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) ··· 244 244 # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) 245 245 /* Unused; kept only for source compatibility */ 246 246 # define PR_MTE_TCF_SHIFT 1 247 + /* RISC-V pointer masking tag length */ 248 + # define PR_PMLEN_SHIFT 24 249 + # define PR_PMLEN_MASK (0x7fUL << PR_PMLEN_SHIFT) 247 250 248 251 /* Control reclaim behavior when allocating memory */ 249 252 #define PR_SET_IO_FLUSHER 57 ··· 330 327 # define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */ 331 328 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ 332 329 # define PR_PPC_DEXCR_CTRL_MASK 0x1f 330 + 331 + /* 332 + * Get the current shadow stack configuration for the current thread, 333 + * this will be the value configured via PR_SET_SHADOW_STACK_STATUS. 334 + */ 335 + #define PR_GET_SHADOW_STACK_STATUS 74 336 + 337 + /* 338 + * Set the current shadow stack configuration. Enabling the shadow 339 + * stack will cause a shadow stack to be allocated for the thread. 340 + */ 341 + #define PR_SET_SHADOW_STACK_STATUS 75 342 + # define PR_SHADOW_STACK_ENABLE (1UL << 0) 343 + # define PR_SHADOW_STACK_WRITE (1UL << 1) 344 + # define PR_SHADOW_STACK_PUSH (1UL << 2) 345 + 346 + /* 347 + * Prevent further changes to the specified shadow stack 348 + * configuration. All bits may be locked via this call, including 349 + * undefined bits. 350 + */ 351 + #define PR_LOCK_SHADOW_STACK_STATUS 76 352 + 353 + /* 354 + * Controls the mode of timer_create() for CRIU restore operations. 355 + * Enabling this allows CRIU to restore timers with explicit IDs. 
356 + * 357 + * Don't use for normal operations as the result might be undefined. 358 + */ 359 + #define PR_TIMER_CREATE_RESTORE_IDS 77 360 + # define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 361 + # define PR_TIMER_CREATE_RESTORE_IDS_ON 1 362 + # define PR_TIMER_CREATE_RESTORE_IDS_GET 2 363 + 364 + /* FUTEX hash management */ 365 + #define PR_FUTEX_HASH 78 366 + # define PR_FUTEX_HASH_SET_SLOTS 1 367 + # define FH_FLAG_IMMUTABLE (1ULL << 0) 368 + # define PR_FUTEX_HASH_GET_SLOTS 2 369 + # define PR_FUTEX_HASH_GET_IMMUTABLE 3 333 370 334 371 #endif /* _LINUX_PRCTL_H */

+1

tools/perf/bench/Build

··· 3 3 perf-bench-y += sched-seccomp-notify.o 4 4 perf-bench-y += syscall.o 5 5 perf-bench-y += mem-functions.o 6 + perf-bench-y += futex.o 6 7 perf-bench-y += futex-hash.o 7 8 perf-bench-y += futex-wake.o 8 9 perf-bench-y += futex-wake-parallel.o

+7

tools/perf/bench/futex-hash.c

··· 18 18 #include <stdlib.h> 19 19 #include <linux/compiler.h> 20 20 #include <linux/kernel.h> 21 + #include <linux/prctl.h> 21 22 #include <linux/zalloc.h> 22 23 #include <sys/time.h> 23 24 #include <sys/mman.h> 25 + #include <sys/prctl.h> 24 26 #include <perf/cpumap.h> 25 27 26 28 #include "../util/mutex.h" ··· 52 50 static struct bench_futex_parameters params = { 53 51 .nfutexes = 1024, 54 52 .runtime = 10, 53 + .nbuckets = -1, 55 54 }; 56 55 57 56 static const struct option options[] = { 57 + OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"), 58 + OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"), 58 59 OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), 59 60 OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"), 60 61 OPT_UINTEGER('f', "futexes", &params.nfutexes, "Specify amount of futexes per threads"), ··· 123 118 printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", 124 119 !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), 125 120 (int)bench__runtime.tv_sec); 121 + futex_print_nbuckets(&params); 126 122 } 127 123 128 124 int bench_futex_hash(int argc, const char **argv) ··· 167 161 168 162 if (!params.fshared) 169 163 futex_flag = FUTEX_PRIVATE_FLAG; 164 + futex_set_nbuckets_param(&params); 170 165 171 166 printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n", 172 167 getpid(), params.nthreads, params.nfutexes, params.fshared ? "shared":"private", params.runtime);

+5

tools/perf/bench/futex-lock-pi.c

··· 41 41 static struct cond thread_parent, thread_worker; 42 42 43 43 static struct bench_futex_parameters params = { 44 + .nbuckets = -1, 44 45 .runtime = 10, 45 46 }; 46 47 47 48 static const struct option options[] = { 49 + OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"), 50 + OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"), 48 51 OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), 49 52 OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"), 50 53 OPT_BOOLEAN( 'M', "multi", &params.multi, "Use multiple futexes"), ··· 70 67 printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", 71 68 !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), 72 69 (int)bench__runtime.tv_sec); 70 + futex_print_nbuckets(&params); 73 71 } 74 72 75 73 static void toggle_done(int sig __maybe_unused, ··· 207 203 mutex_init(&thread_lock); 208 204 cond_init(&thread_parent); 209 205 cond_init(&thread_worker); 206 + futex_set_nbuckets_param(&params); 210 207 211 208 threads_starting = params.nthreads; 212 209 gettimeofday(&bench__start, NULL);

+6

tools/perf/bench/futex-requeue.c

··· 42 42 static int futex_flag = 0; 43 43 44 44 static struct bench_futex_parameters params = { 45 + .nbuckets = -1, 45 46 /* 46 47 * How many tasks to requeue at a time. 47 48 * Default to 1 in order to make the kernel work more. ··· 51 50 }; 52 51 53 52 static const struct option options[] = { 53 + OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"), 54 + OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"), 54 55 OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), 55 56 OPT_UINTEGER('q', "nrequeue", &params.nrequeue, "Specify amount of threads to requeue at once"), 56 57 OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"), ··· 80 77 params.nthreads, 81 78 requeuetime_avg / USEC_PER_MSEC, 82 79 rel_stddev_stats(requeuetime_stddev, requeuetime_avg)); 80 + futex_print_nbuckets(&params); 83 81 } 84 82 85 83 static void *workerfn(void *arg __maybe_unused) ··· 207 203 208 204 if (params.broadcast) 209 205 params.nrequeue = params.nthreads; 206 + 207 + futex_set_nbuckets_param(&params); 210 208 211 209 printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %s%p), " 212 210 "%d at a time.\n\n", getpid(), params.nthreads,

+8 -1

tools/perf/bench/futex-wake-parallel.c

··· 57 57 static unsigned int threads_starting; 58 58 static int futex_flag = 0; 59 59 60 - static struct bench_futex_parameters params; 60 + static struct bench_futex_parameters params = { 61 + .nbuckets = -1, 62 + }; 61 63 62 64 static const struct option options[] = { 65 + OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"), 66 + OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"), 63 67 OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), 64 68 OPT_UINTEGER('w', "nwakers", &params.nwakes, "Specify amount of waking threads"), 65 69 OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"), ··· 222 218 params.nthreads, 223 219 waketime_avg / USEC_PER_MSEC, 224 220 rel_stddev_stats(waketime_stddev, waketime_avg)); 221 + futex_print_nbuckets(&params); 225 222 } 226 223 227 224 ··· 295 290 296 291 if (!params.fshared) 297 292 futex_flag = FUTEX_PRIVATE_FLAG; 293 + 294 + futex_set_nbuckets_param(&params); 298 295 299 296 printf("Run summary [PID %d]: blocking on %d threads (at [%s] " 300 297 "futex %p), %d threads waking up %d at a time.\n\n",

+4

tools/perf/bench/futex-wake.c

··· 42 42 static int futex_flag = 0; 43 43 44 44 static struct bench_futex_parameters params = { 45 + .nbuckets = -1, 45 46 /* 46 47 * How many wakeups to do at a time. 47 48 * Default to 1 in order to make the kernel work more. ··· 51 50 }; 52 51 53 52 static const struct option options[] = { 53 + OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"), 54 + OPT_BOOLEAN( 'I', "immutable", &params.buckets_immutable, "Make the hash buckets immutable"), 54 55 OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"), 55 56 OPT_UINTEGER('w', "nwakes", &params.nwakes, "Specify amount of threads to wake at once"), 56 57 OPT_BOOLEAN( 's', "silent", &params.silent, "Silent mode: do not display data/details"), ··· 96 93 params.nthreads, 97 94 waketime_avg / USEC_PER_MSEC, 98 95 rel_stddev_stats(waketime_stddev, waketime_avg)); 96 + futex_print_nbuckets(&params); 99 97 } 100 98 101 99 static void block_threads(pthread_t *w, struct perf_cpu_map *cpu)

+67

tools/perf/bench/futex.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <err.h> 3 + #include <stdio.h> 4 + #include <stdlib.h> 5 + #include <linux/prctl.h> 6 + #include <sys/prctl.h> 7 + 8 + #include "futex.h" 9 + 10 + void futex_set_nbuckets_param(struct bench_futex_parameters *params) 11 + { 12 + unsigned long flags; 13 + int ret; 14 + 15 + if (params->nbuckets < 0) 16 + return; 17 + 18 + flags = params->buckets_immutable ? FH_FLAG_IMMUTABLE : 0; 19 + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, params->nbuckets, flags); 20 + if (ret) { 21 + printf("Requesting %d hash buckets failed: %d/%m\n", 22 + params->nbuckets, ret); 23 + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); 24 + } 25 + } 26 + 27 + void futex_print_nbuckets(struct bench_futex_parameters *params) 28 + { 29 + char *futex_hash_mode; 30 + int ret; 31 + 32 + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS); 33 + if (params->nbuckets >= 0) { 34 + if (ret != params->nbuckets) { 35 + if (ret < 0) { 36 + printf("Can't query number of buckets: %m\n"); 37 + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); 38 + } 39 + printf("Requested number of hash buckets does not currently used.\n"); 40 + printf("Requested: %d in usage: %d\n", params->nbuckets, ret); 41 + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); 42 + } 43 + if (params->nbuckets == 0) { 44 + ret = asprintf(&futex_hash_mode, "Futex hashing: global hash"); 45 + } else { 46 + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE); 47 + if (ret < 0) { 48 + printf("Can't check if the hash is immutable: %m\n"); 49 + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); 50 + } 51 + ret = asprintf(&futex_hash_mode, "Futex hashing: %d hash buckets %s", 52 + params->nbuckets, 53 + ret == 1 ? 
"(immutable)" : ""); 54 + } 55 + } else { 56 + if (ret <= 0) { 57 + ret = asprintf(&futex_hash_mode, "Futex hashing: global hash"); 58 + } else { 59 + ret = asprintf(&futex_hash_mode, "Futex hashing: auto resized to %d buckets", 60 + ret); 61 + } 62 + } 63 + if (ret < 0) 64 + err(EXIT_FAILURE, "ENOMEM, futex_hash_mode"); 65 + printf("%s\n", futex_hash_mode); 66 + free(futex_hash_mode); 67 + }

+5

tools/perf/bench/futex.h

··· 25 25 unsigned int nfutexes; 26 26 unsigned int nwakes; 27 27 unsigned int nrequeue; 28 + int nbuckets; 29 + bool buckets_immutable; 28 30 }; 29 31 30 32 /** ··· 144 142 return futex_syscall_nr_requeue(uaddr, FUTEX_CMP_REQUEUE_PI, 1, nr_requeue, uaddr2, 145 143 val, opflags); 146 144 } 145 + 146 + void futex_set_nbuckets_param(struct bench_futex_parameters *params); 147 + void futex_print_nbuckets(struct bench_futex_parameters *params); 147 148 148 149 #endif /* _FUTEX_H */

+4 -2

tools/testing/selftests/futex/functional/.gitignore

··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 + futex_numa_mpol 3 + futex_priv_hash 4 + futex_requeue 2 5 futex_requeue_pi 3 6 futex_requeue_pi_mismatched_ops 4 7 futex_requeue_pi_signal_restart 8 + futex_wait 5 9 futex_wait_private_mapped_file 6 10 futex_wait_timeout 7 11 futex_wait_uninitialized_heap 8 12 futex_wait_wouldblock 9 - futex_wait 10 - futex_requeue 11 13 futex_waitv

+5 -2

tools/testing/selftests/futex/functional/Makefile

··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 INCLUDES := -I../include -I../../ $(KHDR_INCLUDES) 3 3 CFLAGS := $(CFLAGS) -g -O2 -Wall -pthread $(INCLUDES) $(KHDR_INCLUDES) 4 - LDLIBS := -lpthread -lrt 4 + LDLIBS := -lpthread -lrt -lnuma 5 5 6 6 LOCAL_HDRS := \ 7 7 ../include/futextest.h \ ··· 17 17 futex_wait_private_mapped_file \ 18 18 futex_wait \ 19 19 futex_requeue \ 20 - futex_waitv 20 + futex_priv_hash \ 21 + futex_numa_mpol \ 22 + futex_waitv \ 23 + futex_numa 21 24 22 25 TEST_PROGS := run.sh 23 26

+262

tools/testing/selftests/futex/functional/futex_numa.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <pthread.h> 4 + #include <sys/shm.h> 5 + #include <sys/mman.h> 6 + #include <fcntl.h> 7 + #include <stdbool.h> 8 + #include <time.h> 9 + #include <assert.h> 10 + #include "logging.h" 11 + #include "futextest.h" 12 + #include "futex2test.h" 13 + 14 + typedef u_int32_t u32; 15 + typedef int32_t s32; 16 + typedef u_int64_t u64; 17 + 18 + static unsigned int fflags = (FUTEX2_SIZE_U32 | FUTEX2_PRIVATE); 19 + static int fnode = FUTEX_NO_NODE; 20 + 21 + /* fairly stupid test-and-set lock with a waiter flag */ 22 + 23 + #define N_LOCK 0x0000001 24 + #define N_WAITERS 0x0001000 25 + 26 + struct futex_numa_32 { 27 + union { 28 + u64 full; 29 + struct { 30 + u32 val; 31 + u32 node; 32 + }; 33 + }; 34 + }; 35 + 36 + void futex_numa_32_lock(struct futex_numa_32 *lock) 37 + { 38 + for (;;) { 39 + struct futex_numa_32 new, old = { 40 + .full = __atomic_load_n(&lock->full, __ATOMIC_RELAXED), 41 + }; 42 + 43 + for (;;) { 44 + new = old; 45 + if (old.val == 0) { 46 + /* no waiter, no lock -> first lock, set no-node */ 47 + new.node = fnode; 48 + } 49 + if (old.val & N_LOCK) { 50 + /* contention, set waiter */ 51 + new.val |= N_WAITERS; 52 + } 53 + new.val |= N_LOCK; 54 + 55 + /* nothing changed, ready to block */ 56 + if (old.full == new.full) 57 + break; 58 + 59 + /* 60 + * Use u64 cmpxchg to set the futex value and node in a 61 + * consistent manner. 
62 + */ 63 + if (__atomic_compare_exchange_n(&lock->full, 64 + &old.full, new.full, 65 + /* .weak */ false, 66 + __ATOMIC_ACQUIRE, 67 + __ATOMIC_RELAXED)) { 68 + 69 + /* if we just set N_LOCK, we own it */ 70 + if (!(old.val & N_LOCK)) 71 + return; 72 + 73 + /* go block */ 74 + break; 75 + } 76 + } 77 + 78 + futex2_wait(lock, new.val, fflags, NULL, 0); 79 + } 80 + } 81 + 82 + void futex_numa_32_unlock(struct futex_numa_32 *lock) 83 + { 84 + u32 val = __atomic_sub_fetch(&lock->val, N_LOCK, __ATOMIC_RELEASE); 85 + assert((s32)val >= 0); 86 + if (val & N_WAITERS) { 87 + int woken = futex2_wake(lock, 1, fflags); 88 + assert(val == N_WAITERS); 89 + if (!woken) { 90 + __atomic_compare_exchange_n(&lock->val, &val, 0U, 91 + false, __ATOMIC_RELAXED, 92 + __ATOMIC_RELAXED); 93 + } 94 + } 95 + } 96 + 97 + static long nanos = 50000; 98 + 99 + struct thread_args { 100 + pthread_t tid; 101 + volatile int * done; 102 + struct futex_numa_32 *lock; 103 + int val; 104 + int *val1, *val2; 105 + int node; 106 + }; 107 + 108 + static void *threadfn(void *_arg) 109 + { 110 + struct thread_args *args = _arg; 111 + struct timespec ts = { 112 + .tv_nsec = nanos, 113 + }; 114 + int node; 115 + 116 + while (!*args->done) { 117 + 118 + futex_numa_32_lock(args->lock); 119 + args->val++; 120 + 121 + assert(*args->val1 == *args->val2); 122 + (*args->val1)++; 123 + nanosleep(&ts, NULL); 124 + (*args->val2)++; 125 + 126 + node = args->lock->node; 127 + futex_numa_32_unlock(args->lock); 128 + 129 + if (node != args->node) { 130 + args->node = node; 131 + printf("node: %d\n", node); 132 + } 133 + 134 + nanosleep(&ts, NULL); 135 + } 136 + 137 + return NULL; 138 + } 139 + 140 + static void *contendfn(void *_arg) 141 + { 142 + struct thread_args *args = _arg; 143 + 144 + while (!*args->done) { 145 + /* 146 + * futex2_wait() will take hb-lock, verify *var == val and 147 + * queue/abort. By knowingly setting val 'wrong' this will 148 + * abort and thereby generate hb-lock contention. 
149 + */ 150 + futex2_wait(&args->lock->val, ~0U, fflags, NULL, 0); 151 + args->val++; 152 + } 153 + 154 + return NULL; 155 + } 156 + 157 + static volatile int done = 0; 158 + static struct futex_numa_32 lock = { .val = 0, }; 159 + static int val1, val2; 160 + 161 + int main(int argc, char *argv[]) 162 + { 163 + struct thread_args *tas[512], *cas[512]; 164 + int c, t, threads = 2, contenders = 0; 165 + int sleeps = 10; 166 + int total = 0; 167 + 168 + while ((c = getopt(argc, argv, "c:t:s:n:N::")) != -1) { 169 + switch (c) { 170 + case 'c': 171 + contenders = atoi(optarg); 172 + break; 173 + case 't': 174 + threads = atoi(optarg); 175 + break; 176 + case 's': 177 + sleeps = atoi(optarg); 178 + break; 179 + case 'n': 180 + nanos = atoi(optarg); 181 + break; 182 + case 'N': 183 + fflags |= FUTEX2_NUMA; 184 + if (optarg) 185 + fnode = atoi(optarg); 186 + break; 187 + default: 188 + exit(1); 189 + break; 190 + } 191 + } 192 + 193 + for (t = 0; t < contenders; t++) { 194 + struct thread_args *args = calloc(1, sizeof(*args)); 195 + if (!args) { 196 + perror("thread_args"); 197 + exit(-1); 198 + } 199 + 200 + args->done = &done; 201 + args->lock = &lock; 202 + args->val1 = &val1; 203 + args->val2 = &val2; 204 + args->node = -1; 205 + 206 + if (pthread_create(&args->tid, NULL, contendfn, args)) { 207 + perror("pthread_create"); 208 + exit(-1); 209 + } 210 + 211 + cas[t] = args; 212 + } 213 + 214 + for (t = 0; t < threads; t++) { 215 + struct thread_args *args = calloc(1, sizeof(*args)); 216 + if (!args) { 217 + perror("thread_args"); 218 + exit(-1); 219 + } 220 + 221 + args->done = &done; 222 + args->lock = &lock; 223 + args->val1 = &val1; 224 + args->val2 = &val2; 225 + args->node = -1; 226 + 227 + if (pthread_create(&args->tid, NULL, threadfn, args)) { 228 + perror("pthread_create"); 229 + exit(-1); 230 + } 231 + 232 + tas[t] = args; 233 + } 234 + 235 + sleep(sleeps); 236 + 237 + done = true; 238 + 239 + for (t = 0; t < threads; t++) { 240 + struct thread_args *args = 
tas[t]; 241 + 242 + pthread_join(args->tid, NULL); 243 + total += args->val; 244 + // printf("tval: %d\n", args->val); 245 + } 246 + printf("total: %d\n", total); 247 + 248 + if (contenders) { 249 + total = 0; 250 + for (t = 0; t < contenders; t++) { 251 + struct thread_args *args = cas[t]; 252 + 253 + pthread_join(args->tid, NULL); 254 + total += args->val; 255 + // printf("tval: %d\n", args->val); 256 + } 257 + printf("contenders: %d\n", total); 258 + } 259 + 260 + return 0; 261 + } 262 +

+231

tools/testing/selftests/futex/functional/futex_numa_mpol.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (C) 2025 Sebastian Andrzej Siewior <bigeasy@linutronix.de> 4 + */ 5 + 6 + #define _GNU_SOURCE 7 + 8 + #include <errno.h> 9 + #include <pthread.h> 10 + #include <stdio.h> 11 + #include <stdlib.h> 12 + #include <unistd.h> 13 + #include <numa.h> 14 + #include <numaif.h> 15 + 16 + #include <linux/futex.h> 17 + #include <sys/mman.h> 18 + 19 + #include "logging.h" 20 + #include "futextest.h" 21 + #include "futex2test.h" 22 + 23 + #define MAX_THREADS 64 24 + 25 + static pthread_barrier_t barrier_main; 26 + static pthread_t threads[MAX_THREADS]; 27 + 28 + struct thread_args { 29 + void *futex_ptr; 30 + unsigned int flags; 31 + int result; 32 + }; 33 + 34 + static struct thread_args thread_args[MAX_THREADS]; 35 + 36 + #ifndef FUTEX_NO_NODE 37 + #define FUTEX_NO_NODE (-1) 38 + #endif 39 + 40 + #ifndef FUTEX2_MPOL 41 + #define FUTEX2_MPOL 0x08 42 + #endif 43 + 44 + static void *thread_lock_fn(void *arg) 45 + { 46 + struct thread_args *args = arg; 47 + int ret; 48 + 49 + pthread_barrier_wait(&barrier_main); 50 + ret = futex2_wait(args->futex_ptr, 0, args->flags, NULL, 0); 51 + args->result = ret; 52 + return NULL; 53 + } 54 + 55 + static void create_max_threads(void *futex_ptr) 56 + { 57 + int i, ret; 58 + 59 + for (i = 0; i < MAX_THREADS; i++) { 60 + thread_args[i].futex_ptr = futex_ptr; 61 + thread_args[i].flags = FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA; 62 + thread_args[i].result = 0; 63 + ret = pthread_create(&threads[i], NULL, thread_lock_fn, &thread_args[i]); 64 + if (ret) 65 + ksft_exit_fail_msg("pthread_create failed\n"); 66 + } 67 + } 68 + 69 + static void join_max_threads(void) 70 + { 71 + int i, ret; 72 + 73 + for (i = 0; i < MAX_THREADS; i++) { 74 + ret = pthread_join(threads[i], NULL); 75 + if (ret) 76 + ksft_exit_fail_msg("pthread_join failed for thread %d\n", i); 77 + } 78 + } 79 + 80 + static void __test_futex(void *futex_ptr, int must_fail, unsigned int futex_flags) 81 + { 
82 + int to_wake, ret, i, need_exit = 0; 83 + 84 + pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1); 85 + create_max_threads(futex_ptr); 86 + pthread_barrier_wait(&barrier_main); 87 + to_wake = MAX_THREADS; 88 + 89 + do { 90 + ret = futex2_wake(futex_ptr, to_wake, futex_flags); 91 + if (must_fail) { 92 + if (ret < 0) 93 + break; 94 + ksft_exit_fail_msg("futex2_wake(%d, 0x%x) should fail, but didn't\n", 95 + to_wake, futex_flags); 96 + } 97 + if (ret < 0) { 98 + ksft_exit_fail_msg("Failed futex2_wake(%d, 0x%x): %m\n", 99 + to_wake, futex_flags); 100 + } 101 + if (!ret) 102 + usleep(50); 103 + to_wake -= ret; 104 + 105 + } while (to_wake); 106 + join_max_threads(); 107 + 108 + for (i = 0; i < MAX_THREADS; i++) { 109 + if (must_fail && thread_args[i].result != -1) { 110 + ksft_print_msg("Thread %d should fail but succeeded (%d)\n", 111 + i, thread_args[i].result); 112 + need_exit = 1; 113 + } 114 + if (!must_fail && thread_args[i].result != 0) { 115 + ksft_print_msg("Thread %d failed (%d)\n", i, thread_args[i].result); 116 + need_exit = 1; 117 + } 118 + } 119 + if (need_exit) 120 + ksft_exit_fail_msg("Aborting due to earlier errors.\n"); 121 + } 122 + 123 + static void test_futex(void *futex_ptr, int must_fail) 124 + { 125 + __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA); 126 + } 127 + 128 + static void test_futex_mpol(void *futex_ptr, int must_fail) 129 + { 130 + __test_futex(futex_ptr, must_fail, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); 131 + } 132 + 133 + static void usage(char *prog) 134 + { 135 + printf("Usage: %s\n", prog); 136 + printf(" -c Use color\n"); 137 + printf(" -h Display this help message\n"); 138 + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", 139 + VQUIET, VCRITICAL, VINFO); 140 + } 141 + 142 + int main(int argc, char *argv[]) 143 + { 144 + struct futex32_numa *futex_numa; 145 + int mem_size, i; 146 + void *futex_ptr; 147 + char c; 148 + 149 + while ((c = 
getopt(argc, argv, "chv:")) != -1) { 150 + switch (c) { 151 + case 'c': 152 + log_color(1); 153 + break; 154 + case 'h': 155 + usage(basename(argv[0])); 156 + exit(0); 157 + break; 158 + case 'v': 159 + log_verbosity(atoi(optarg)); 160 + break; 161 + default: 162 + usage(basename(argv[0])); 163 + exit(1); 164 + } 165 + } 166 + 167 + ksft_print_header(); 168 + ksft_set_plan(1); 169 + 170 + mem_size = sysconf(_SC_PAGE_SIZE); 171 + futex_ptr = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); 172 + if (futex_ptr == MAP_FAILED) 173 + ksft_exit_fail_msg("mmap() for %d bytes failed\n", mem_size); 174 + 175 + futex_numa = futex_ptr; 176 + 177 + ksft_print_msg("Regular test\n"); 178 + futex_numa->futex = 0; 179 + futex_numa->numa = FUTEX_NO_NODE; 180 + test_futex(futex_ptr, 0); 181 + 182 + if (futex_numa->numa == FUTEX_NO_NODE) 183 + ksft_exit_fail_msg("NUMA node is left uninitialized\n"); 184 + 185 + ksft_print_msg("Memory too small\n"); 186 + test_futex(futex_ptr + mem_size - 4, 1); 187 + 188 + ksft_print_msg("Memory out of range\n"); 189 + test_futex(futex_ptr + mem_size, 1); 190 + 191 + futex_numa->numa = FUTEX_NO_NODE; 192 + mprotect(futex_ptr, mem_size, PROT_READ); 193 + ksft_print_msg("Memory, RO\n"); 194 + test_futex(futex_ptr, 1); 195 + 196 + mprotect(futex_ptr, mem_size, PROT_NONE); 197 + ksft_print_msg("Memory, no access\n"); 198 + test_futex(futex_ptr, 1); 199 + 200 + mprotect(futex_ptr, mem_size, PROT_READ | PROT_WRITE); 201 + ksft_print_msg("Memory back to RW\n"); 202 + test_futex(futex_ptr, 0); 203 + 204 + /* MPOL test. 
Does not work as expected */ 205 + for (i = 0; i < 4; i++) { 206 + unsigned long nodemask; 207 + int ret; 208 + 209 + nodemask = 1 << i; 210 + ret = mbind(futex_ptr, mem_size, MPOL_BIND, &nodemask, 211 + sizeof(nodemask) * 8, 0); 212 + if (ret == 0) { 213 + ksft_print_msg("Node %d test\n", i); 214 + futex_numa->futex = 0; 215 + futex_numa->numa = FUTEX_NO_NODE; 216 + 217 + ret = futex2_wake(futex_ptr, 0, FUTEX2_SIZE_U32 | FUTEX_PRIVATE_FLAG | FUTEX2_NUMA | FUTEX2_MPOL); 218 + if (ret < 0) 219 + ksft_test_result_fail("Failed to wake 0 with MPOL: %m\n"); 220 + if (0) 221 + test_futex_mpol(futex_numa, 0); 222 + if (futex_numa->numa != i) { 223 + ksft_test_result_fail("Returned NUMA node is %d expected %d\n", 224 + futex_numa->numa, i); 225 + } 226 + } 227 + } 228 + ksft_test_result_pass("NUMA MPOL tests passed\n"); 229 + ksft_finished(); 230 + return 0; 231 + }

+292

tools/testing/selftests/futex/functional/futex_priv_hash.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (C) 2025 Sebastian Andrzej Siewior <bigeasy@linutronix.de> 4 + */ 5 + 6 + #define _GNU_SOURCE 7 + 8 + #include <errno.h> 9 + #include <pthread.h> 10 + #include <stdio.h> 11 + #include <stdlib.h> 12 + #include <unistd.h> 13 + 14 + #include <linux/prctl.h> 15 + #include <sys/prctl.h> 16 + 17 + #include "logging.h" 18 + 19 + #define MAX_THREADS 64 20 + 21 + static pthread_barrier_t barrier_main; 22 + static pthread_mutex_t global_lock; 23 + static pthread_t threads[MAX_THREADS]; 24 + static int counter; 25 + 26 + #ifndef PR_FUTEX_HASH 27 + #define PR_FUTEX_HASH 78 28 + # define PR_FUTEX_HASH_SET_SLOTS 1 29 + # define FH_FLAG_IMMUTABLE (1ULL << 0) 30 + # define PR_FUTEX_HASH_GET_SLOTS 2 31 + # define PR_FUTEX_HASH_GET_IMMUTABLE 3 32 + #endif 33 + 34 + static int futex_hash_slots_set(unsigned int slots, int flags) 35 + { 36 + return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, slots, flags); 37 + } 38 + 39 + static int futex_hash_slots_get(void) 40 + { 41 + return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS); 42 + } 43 + 44 + static int futex_hash_immutable_get(void) 45 + { 46 + return prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_IMMUTABLE); 47 + } 48 + 49 + static void futex_hash_slots_set_verify(int slots) 50 + { 51 + int ret; 52 + 53 + ret = futex_hash_slots_set(slots, 0); 54 + if (ret != 0) { 55 + ksft_test_result_fail("Failed to set slots to %d: %m\n", slots); 56 + ksft_finished(); 57 + } 58 + ret = futex_hash_slots_get(); 59 + if (ret != slots) { 60 + ksft_test_result_fail("Set %d slots but PR_FUTEX_HASH_GET_SLOTS returns: %d, %m\n", 61 + slots, ret); 62 + ksft_finished(); 63 + } 64 + ksft_test_result_pass("SET and GET slots %d passed\n", slots); 65 + } 66 + 67 + static void futex_hash_slots_set_must_fail(int slots, int flags) 68 + { 69 + int ret; 70 + 71 + ret = futex_hash_slots_set(slots, flags); 72 + ksft_test_result(ret < 0, "futex_hash_slots_set(%d, %d)\n", 73 + slots, flags); 74 + } 75 + 
76 + static void *thread_return_fn(void *arg) 77 + { 78 + return NULL; 79 + } 80 + 81 + static void *thread_lock_fn(void *arg) 82 + { 83 + pthread_barrier_wait(&barrier_main); 84 + 85 + pthread_mutex_lock(&global_lock); 86 + counter++; 87 + usleep(20); 88 + pthread_mutex_unlock(&global_lock); 89 + return NULL; 90 + } 91 + 92 + static void create_max_threads(void *(*thread_fn)(void *)) 93 + { 94 + int i, ret; 95 + 96 + for (i = 0; i < MAX_THREADS; i++) { 97 + ret = pthread_create(&threads[i], NULL, thread_fn, NULL); 98 + if (ret) 99 + ksft_exit_fail_msg("pthread_create failed: %m\n"); 100 + } 101 + } 102 + 103 + static void join_max_threads(void) 104 + { 105 + int i, ret; 106 + 107 + for (i = 0; i < MAX_THREADS; i++) { 108 + ret = pthread_join(threads[i], NULL); 109 + if (ret) 110 + ksft_exit_fail_msg("pthread_join failed for thread %d\n", i); 111 + } 112 + } 113 + 114 + static void usage(char *prog) 115 + { 116 + printf("Usage: %s\n", prog); 117 + printf(" -c Use color\n"); 118 + printf(" -g Test global hash instead intead local immutable \n"); 119 + printf(" -h Display this help message\n"); 120 + printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", 121 + VQUIET, VCRITICAL, VINFO); 122 + } 123 + 124 + static const char *test_msg_auto_create = "Automatic hash bucket init on thread creation.\n"; 125 + static const char *test_msg_auto_inc = "Automatic increase with more than 16 CPUs\n"; 126 + 127 + int main(int argc, char *argv[]) 128 + { 129 + int futex_slots1, futex_slotsn, online_cpus; 130 + pthread_mutexattr_t mutex_attr_pi; 131 + int use_global_hash = 0; 132 + int ret; 133 + char c; 134 + 135 + while ((c = getopt(argc, argv, "cghv:")) != -1) { 136 + switch (c) { 137 + case 'c': 138 + log_color(1); 139 + break; 140 + case 'g': 141 + use_global_hash = 1; 142 + break; 143 + case 'h': 144 + usage(basename(argv[0])); 145 + exit(0); 146 + break; 147 + case 'v': 148 + log_verbosity(atoi(optarg)); 149 + break; 150 + default: 151 + usage(basename(argv[0])); 
152 + exit(1); 153 + } 154 + } 155 + 156 + ksft_print_header(); 157 + ksft_set_plan(22); 158 + 159 + ret = pthread_mutexattr_init(&mutex_attr_pi); 160 + ret |= pthread_mutexattr_setprotocol(&mutex_attr_pi, PTHREAD_PRIO_INHERIT); 161 + ret |= pthread_mutex_init(&global_lock, &mutex_attr_pi); 162 + if (ret != 0) { 163 + ksft_exit_fail_msg("Failed to initialize pthread mutex.\n"); 164 + } 165 + /* First thread, expect to be 0, not yet initialized */ 166 + ret = futex_hash_slots_get(); 167 + if (ret != 0) 168 + ksft_exit_fail_msg("futex_hash_slots_get() failed: %d, %m\n", ret); 169 + 170 + ret = futex_hash_immutable_get(); 171 + if (ret != 0) 172 + ksft_exit_fail_msg("futex_hash_immutable_get() failed: %d, %m\n", ret); 173 + 174 + ksft_test_result_pass("Basic get slots and immutable status.\n"); 175 + ret = pthread_create(&threads[0], NULL, thread_return_fn, NULL); 176 + if (ret != 0) 177 + ksft_exit_fail_msg("pthread_create() failed: %d, %m\n", ret); 178 + 179 + ret = pthread_join(threads[0], NULL); 180 + if (ret != 0) 181 + ksft_exit_fail_msg("pthread_join() failed: %d, %m\n", ret); 182 + 183 + /* First thread, has to initialiaze private hash */ 184 + futex_slots1 = futex_hash_slots_get(); 185 + if (futex_slots1 <= 0) { 186 + ksft_print_msg("Current hash buckets: %d\n", futex_slots1); 187 + ksft_exit_fail_msg(test_msg_auto_create); 188 + } 189 + 190 + ksft_test_result_pass(test_msg_auto_create); 191 + 192 + online_cpus = sysconf(_SC_NPROCESSORS_ONLN); 193 + ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS + 1); 194 + if (ret != 0) 195 + ksft_exit_fail_msg("pthread_barrier_init failed: %m.\n"); 196 + 197 + ret = pthread_mutex_lock(&global_lock); 198 + if (ret != 0) 199 + ksft_exit_fail_msg("pthread_mutex_lock failed: %m.\n"); 200 + 201 + counter = 0; 202 + create_max_threads(thread_lock_fn); 203 + pthread_barrier_wait(&barrier_main); 204 + 205 + /* 206 + * The current default size of hash buckets is 16. 
The auto increase 207 + * works only if more than 16 CPUs are available. 208 + */ 209 + ksft_print_msg("Online CPUs: %d\n", online_cpus); 210 + if (online_cpus > 16) { 211 + futex_slotsn = futex_hash_slots_get(); 212 + if (futex_slotsn < 0 || futex_slots1 == futex_slotsn) { 213 + ksft_print_msg("Expected increase of hash buckets but got: %d -> %d\n", 214 + futex_slots1, futex_slotsn); 215 + ksft_exit_fail_msg(test_msg_auto_inc); 216 + } 217 + ksft_test_result_pass(test_msg_auto_inc); 218 + } else { 219 + ksft_test_result_skip(test_msg_auto_inc); 220 + } 221 + ret = pthread_mutex_unlock(&global_lock); 222 + 223 + /* Once the user changes it, it has to be what is set */ 224 + futex_hash_slots_set_verify(2); 225 + futex_hash_slots_set_verify(4); 226 + futex_hash_slots_set_verify(8); 227 + futex_hash_slots_set_verify(32); 228 + futex_hash_slots_set_verify(16); 229 + 230 + ret = futex_hash_slots_set(15, 0); 231 + ksft_test_result(ret < 0, "Use 15 slots\n"); 232 + 233 + futex_hash_slots_set_verify(2); 234 + join_max_threads(); 235 + ksft_test_result(counter == MAX_THREADS, "Created of waited for %d of %d threads\n", 236 + counter, MAX_THREADS); 237 + counter = 0; 238 + /* Once the user set something, auto reisze must be disabled */ 239 + ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); 240 + 241 + create_max_threads(thread_lock_fn); 242 + join_max_threads(); 243 + 244 + ret = futex_hash_slots_get(); 245 + ksft_test_result(ret == 2, "No more auto-resize after manaul setting, got %d\n", 246 + ret); 247 + 248 + futex_hash_slots_set_must_fail(1 << 29, 0); 249 + 250 + /* 251 + * Once the private hash has been made immutable or global hash has been requested, 252 + * then this requested can not be undone. 
253 + */ 254 + if (use_global_hash) { 255 + ret = futex_hash_slots_set(0, 0); 256 + ksft_test_result(ret == 0, "Global hash request\n"); 257 + } else { 258 + ret = futex_hash_slots_set(4, FH_FLAG_IMMUTABLE); 259 + ksft_test_result(ret == 0, "Immutable resize to 4\n"); 260 + } 261 + if (ret != 0) 262 + goto out; 263 + 264 + futex_hash_slots_set_must_fail(4, 0); 265 + futex_hash_slots_set_must_fail(4, FH_FLAG_IMMUTABLE); 266 + futex_hash_slots_set_must_fail(8, 0); 267 + futex_hash_slots_set_must_fail(8, FH_FLAG_IMMUTABLE); 268 + futex_hash_slots_set_must_fail(0, FH_FLAG_IMMUTABLE); 269 + futex_hash_slots_set_must_fail(6, FH_FLAG_IMMUTABLE); 270 + 271 + ret = pthread_barrier_init(&barrier_main, NULL, MAX_THREADS); 272 + if (ret != 0) { 273 + ksft_exit_fail_msg("pthread_barrier_init failed: %m\n"); 274 + return 1; 275 + } 276 + create_max_threads(thread_lock_fn); 277 + join_max_threads(); 278 + 279 + ret = futex_hash_slots_get(); 280 + if (use_global_hash) { 281 + ksft_test_result(ret == 0, "Continue to use global hash\n"); 282 + } else { 283 + ksft_test_result(ret == 4, "Continue to use the 4 hash buckets\n"); 284 + } 285 + 286 + ret = futex_hash_immutable_get(); 287 + ksft_test_result(ret == 1, "Hash reports to be immutable\n"); 288 + 289 + out: 290 + ksft_finished(); 291 + return 0; 292 + }

+7

tools/testing/selftests/futex/functional/run.sh

··· 82 82 83 83 echo 84 84 ./futex_waitv $COLOR 85 + 86 + echo 87 + ./futex_priv_hash $COLOR 88 + ./futex_priv_hash -g $COLOR 89 + 90 + echo 91 + ./futex_numa_mpol $COLOR

+70

tools/testing/selftests/futex/include/futex2test.h

··· 8 8 9 9 #define u64_to_ptr(x) ((void *)(uintptr_t)(x)) 10 10 11 + #ifndef __NR_futex_waitv 12 + #define __NR_futex_waitv 449 13 + struct futex_waitv { 14 + __u64 val; 15 + __u64 uaddr; 16 + __u32 flags; 17 + __u32 __reserved; 18 + }; 19 + #endif 20 + 21 + #ifndef __NR_futex_wake 22 + #define __NR_futex_wake 454 23 + #endif 24 + 25 + #ifndef __NR_futex_wait 26 + #define __NR_futex_wait 455 27 + #endif 28 + 29 + #ifndef FUTEX2_SIZE_U32 30 + #define FUTEX2_SIZE_U32 0x02 31 + #endif 32 + 33 + #ifndef FUTEX2_NUMA 34 + #define FUTEX2_NUMA 0x04 35 + #endif 36 + 37 + #ifndef FUTEX2_MPOL 38 + #define FUTEX2_MPOL 0x08 39 + #endif 40 + 41 + #ifndef FUTEX2_PRIVATE 42 + #define FUTEX2_PRIVATE FUTEX_PRIVATE_FLAG 43 + #endif 44 + 45 + #ifndef FUTEX2_NO_NODE 46 + #define FUTEX_NO_NODE (-1) 47 + #endif 48 + 49 + #ifndef FUTEX_32 50 + #define FUTEX_32 FUTEX2_SIZE_U32 51 + #endif 52 + 53 + struct futex32_numa { 54 + futex_t futex; 55 + futex_t numa; 56 + }; 57 + 11 58 /** 12 59 * futex_waitv - Wait at multiple futexes, wake on any 13 60 * @waiters: Array of waiters ··· 66 19 unsigned long flags, struct timespec *timo, clockid_t clockid) 67 20 { 68 21 return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid); 22 + } 23 + 24 + /* 25 + * futex_wait() - block on uaddr with optional timeout 26 + * @val: Expected value 27 + * @flags: FUTEX2 flags 28 + * @timeout: Relative timeout 29 + * @clockid: Clock id for the timeout 30 + */ 31 + static inline int futex2_wait(void *uaddr, long val, unsigned int flags, 32 + struct timespec *timeout, clockid_t clockid) 33 + { 34 + return syscall(__NR_futex_wait, uaddr, val, ~0U, flags, timeout, clockid); 35 + } 36 + 37 + /* 38 + * futex2_wake() - Wake a number of futexes 39 + * @nr: Number of threads to wake at most 40 + * @flags: FUTEX2 flags 41 + */ 42 + static inline int futex2_wake(void *uaddr, int nr, unsigned int flags) 43 + { 44 + return syscall(__NR_futex_wake, uaddr, ~0U, nr, flags); 69 45 }

Configure Feed

Configure Feed