Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

futex: Add basic infrastructure for local task local hash

The futex hash is system wide and shared by all tasks. Each slot
is hashed based on futex address and the VMA of the thread. Due to
randomized VMAs (and memory allocations) the same logical lock (pointer)
can end up in a different hash bucket on each invocation of the
application. This in turn means that different applications may share a
hash bucket on the first invocation but not on the second and it is not
always clear which applications will be involved. This can result in
high latencies to acquire the futex_hash_bucket::lock especially if the
lock owner is limited to a CPU and cannot be effectively PI boosted.

Introduce basic infrastructure for a process local hash which is shared by
all threads of a process. This hash will only be used for a
PROCESS_PRIVATE FUTEX operation.

The hashmap can be allocated via:

prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, num);

A `num' of 0 means that the global hash is used instead of a private
hash.
Other values for `num' specify the number of slots for the hash and the
number must be a power of two, starting with two.
The prctl() returns zero on success. This function can only be used
before a thread is created.

The current status for the private hash can be queried via:

num = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);

which returns the current number of slots. The value 0 means that the
global hash is used. Values greater than 0 indicate the number of slots
that are used. A negative number indicates an error.

As an optimisation, for the private hash jhash2() uses only two arguments:
the address and the offset. This omits the VMA which is always the same.

[peterz: Use 0 for global hash. A bit of shuffling and renaming.]

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250416162921.513656-13-bigeasy@linutronix.de

authored by

Sebastian Andrzej Siewior and committed by
Peter Zijlstra
80367ad0 9a9bdfdd

+244 -21
+24 -2
include/linux/futex.h
··· 4 4 5 5 #include <linux/sched.h> 6 6 #include <linux/ktime.h> 7 + #include <linux/mm_types.h> 7 8 8 9 #include <uapi/linux/futex.h> 9 10 10 11 struct inode; 11 - struct mm_struct; 12 12 struct task_struct; 13 13 14 14 /* ··· 77 77 78 78 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 79 79 u32 __user *uaddr2, u32 val2, u32 val3); 80 - #else 80 + int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4); 81 + 82 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 83 + void futex_hash_free(struct mm_struct *mm); 84 + 85 + static inline void futex_mm_init(struct mm_struct *mm) 86 + { 87 + mm->futex_phash = NULL; 88 + } 89 + 90 + #else /* !CONFIG_FUTEX_PRIVATE_HASH */ 91 + static inline void futex_hash_free(struct mm_struct *mm) { } 92 + static inline void futex_mm_init(struct mm_struct *mm) { } 93 + #endif /* CONFIG_FUTEX_PRIVATE_HASH */ 94 + 95 + #else /* !CONFIG_FUTEX */ 81 96 static inline void futex_init_task(struct task_struct *tsk) { } 82 97 static inline void futex_exit_recursive(struct task_struct *tsk) { } 83 98 static inline void futex_exit_release(struct task_struct *tsk) { } ··· 103 88 { 104 89 return -EINVAL; 105 90 } 91 + static inline int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) 92 + { 93 + return -EINVAL; 94 + } 95 + static inline void futex_hash_free(struct mm_struct *mm) { } 96 + static inline void futex_mm_init(struct mm_struct *mm) { } 97 + 106 98 #endif 107 99 108 100 #endif
+4 -1
include/linux/mm_types.h
··· 31 31 #define INIT_PASID 0 32 32 33 33 struct address_space; 34 + struct futex_private_hash; 34 35 struct mem_cgroup; 35 36 36 37 /* ··· 1032 1031 */ 1033 1032 seqcount_t mm_lock_seq; 1034 1033 #endif 1035 - 1034 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 1035 + struct futex_private_hash *futex_phash; 1036 + #endif 1036 1037 1037 1038 unsigned long hiwater_rss; /* High-watermark of RSS usage */ 1038 1039 unsigned long hiwater_vm; /* High-water virtual memory usage */
+5
include/uapi/linux/prctl.h
··· 364 364 # define PR_TIMER_CREATE_RESTORE_IDS_ON 1 365 365 # define PR_TIMER_CREATE_RESTORE_IDS_GET 2 366 366 367 + /* FUTEX hash management */ 368 + #define PR_FUTEX_HASH 78 369 + # define PR_FUTEX_HASH_SET_SLOTS 1 370 + # define PR_FUTEX_HASH_GET_SLOTS 2 371 + 367 372 #endif /* _LINUX_PRCTL_H */
+5
init/Kconfig
··· 1699 1699 depends on FUTEX && RT_MUTEXES 1700 1700 default y 1701 1701 1702 + config FUTEX_PRIVATE_HASH 1703 + bool 1704 + depends on FUTEX && !BASE_SMALL && MMU 1705 + default y 1706 + 1702 1707 config EPOLL 1703 1708 bool "Enable eventpoll support" if EXPERT 1704 1709 default y
+2
kernel/fork.c
··· 1305 1305 RCU_INIT_POINTER(mm->exe_file, NULL); 1306 1306 mmu_notifier_subscriptions_init(mm); 1307 1307 init_tlb_flush_pending(mm); 1308 + futex_mm_init(mm); 1308 1309 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) 1309 1310 mm->pmd_huge_pte = NULL; 1310 1311 #endif ··· 1388 1387 if (mm->binfmt) 1389 1388 module_put(mm->binfmt->module); 1390 1389 lru_gen_del_mm(mm); 1390 + futex_hash_free(mm); 1391 1391 mmdrop(mm); 1392 1392 } 1393 1393
+190 -18
kernel/futex/core.c
··· 39 39 #include <linux/memblock.h> 40 40 #include <linux/fault-inject.h> 41 41 #include <linux/slab.h> 42 + #include <linux/prctl.h> 42 43 43 44 #include "futex.h" 44 45 #include "../locking/rtmutex_common.h" ··· 56 55 #define futex_queues (__futex_data.queues) 57 56 #define futex_hashmask (__futex_data.hashmask) 58 57 58 + struct futex_private_hash { 59 + unsigned int hash_mask; 60 + void *mm; 61 + bool custom; 62 + struct futex_hash_bucket queues[]; 63 + }; 59 64 60 65 /* 61 66 * Fault injections for futexes. ··· 114 107 115 108 #endif /* CONFIG_FAIL_FUTEX */ 116 109 117 - struct futex_private_hash *futex_private_hash(void) 110 + static struct futex_hash_bucket * 111 + __futex_hash(union futex_key *key, struct futex_private_hash *fph); 112 + 113 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 114 + static inline bool futex_key_is_private(union futex_key *key) 118 115 { 119 - return NULL; 116 + /* 117 + * Relies on get_futex_key() to set either bit for shared 118 + * futexes -- see comment with union futex_key. 119 + */ 120 + return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); 120 121 } 121 122 122 123 bool futex_private_hash_get(struct futex_private_hash *fph) ··· 132 117 return false; 133 118 } 134 119 135 - void futex_private_hash_put(struct futex_private_hash *fph) { } 136 - 137 - /** 138 - * futex_hash - Return the hash bucket in the global hash 139 - * @key: Pointer to the futex key for which the hash is calculated 140 - * 141 - * We hash on the keys returned from get_futex_key (see below) and return the 142 - * corresponding hash bucket in the global hash. 
143 - */ 144 - struct futex_hash_bucket *futex_hash(union futex_key *key) 120 + void futex_private_hash_put(struct futex_private_hash *fph) 145 121 { 146 - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, 147 - key->both.offset); 148 - 149 - return &futex_queues[hash & futex_hashmask]; 150 122 } 151 123 152 124 /** ··· 145 143 */ 146 144 void futex_hash_get(struct futex_hash_bucket *hb) { } 147 145 void futex_hash_put(struct futex_hash_bucket *hb) { } 146 + 147 + static struct futex_hash_bucket * 148 + __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) 149 + { 150 + u32 hash; 151 + 152 + if (!futex_key_is_private(key)) 153 + return NULL; 154 + 155 + if (!fph) 156 + fph = key->private.mm->futex_phash; 157 + if (!fph || !fph->hash_mask) 158 + return NULL; 159 + 160 + hash = jhash2((void *)&key->private.address, 161 + sizeof(key->private.address) / 4, 162 + key->both.offset); 163 + return &fph->queues[hash & fph->hash_mask]; 164 + } 165 + 166 + struct futex_private_hash *futex_private_hash(void) 167 + { 168 + struct mm_struct *mm = current->mm; 169 + struct futex_private_hash *fph; 170 + 171 + fph = mm->futex_phash; 172 + return fph; 173 + } 174 + 175 + struct futex_hash_bucket *futex_hash(union futex_key *key) 176 + { 177 + struct futex_hash_bucket *hb; 178 + 179 + hb = __futex_hash(key, NULL); 180 + return hb; 181 + } 182 + 183 + #else /* !CONFIG_FUTEX_PRIVATE_HASH */ 184 + 185 + static struct futex_hash_bucket * 186 + __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) 187 + { 188 + return NULL; 189 + } 190 + 191 + struct futex_hash_bucket *futex_hash(union futex_key *key) 192 + { 193 + return __futex_hash(key, NULL); 194 + } 195 + 196 + #endif /* CONFIG_FUTEX_PRIVATE_HASH */ 197 + 198 + /** 199 + * __futex_hash - Return the hash bucket 200 + * @key: Pointer to the futex key for which the hash is calculated 201 + * @fph: Pointer to private hash if known 202 + * 203 + * We hash on the keys returned 
from get_futex_key (see below) and return the 204 + * corresponding hash bucket. 205 + * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the 206 + * private hash) is returned if existing. Otherwise a hash bucket from the 207 + * global hash is returned. 208 + */ 209 + static struct futex_hash_bucket * 210 + __futex_hash(union futex_key *key, struct futex_private_hash *fph) 211 + { 212 + struct futex_hash_bucket *hb; 213 + u32 hash; 214 + 215 + hb = __futex_hash_private(key, fph); 216 + if (hb) 217 + return hb; 218 + 219 + hash = jhash2((u32 *)key, 220 + offsetof(typeof(*key), both.offset) / 4, 221 + key->both.offset); 222 + return &futex_queues[hash & futex_hashmask]; 223 + } 148 224 149 225 /** 150 226 * futex_setup_timer - set up the sleeping hrtimer. ··· 1066 986 union futex_key key = FUTEX_KEY_INIT; 1067 987 1068 988 /* 989 + * Ensure the hash remains stable (no resize) during the while loop 990 + * below. The hb pointer is acquired under the pi_lock so we can't block 991 + * on the mutex. 
992 + */ 993 + WARN_ON(curr != current); 994 + guard(private_hash)(); 995 + /* 1069 996 * We are a ZOMBIE and nobody can enqueue itself on 1070 997 * pi_state_list anymore, but we have to be careful 1071 998 * versus waiters unqueueing themselves: ··· 1247 1160 futex_cleanup_end(tsk, FUTEX_STATE_DEAD); 1248 1161 } 1249 1162 1250 - static void futex_hash_bucket_init(struct futex_hash_bucket *fhb) 1163 + static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, 1164 + struct futex_private_hash *fph) 1251 1165 { 1166 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 1167 + fhb->priv = fph; 1168 + #endif 1252 1169 atomic_set(&fhb->waiters, 0); 1253 1170 plist_head_init(&fhb->chain); 1254 1171 spin_lock_init(&fhb->lock); 1172 + } 1173 + 1174 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 1175 + void futex_hash_free(struct mm_struct *mm) 1176 + { 1177 + kvfree(mm->futex_phash); 1178 + } 1179 + 1180 + static int futex_hash_allocate(unsigned int hash_slots, bool custom) 1181 + { 1182 + struct mm_struct *mm = current->mm; 1183 + struct futex_private_hash *fph; 1184 + int i; 1185 + 1186 + if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) 1187 + return -EINVAL; 1188 + 1189 + if (mm->futex_phash) 1190 + return -EALREADY; 1191 + 1192 + if (!thread_group_empty(current)) 1193 + return -EINVAL; 1194 + 1195 + fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1196 + if (!fph) 1197 + return -ENOMEM; 1198 + 1199 + fph->hash_mask = hash_slots ? 
hash_slots - 1 : 0; 1200 + fph->custom = custom; 1201 + fph->mm = mm; 1202 + 1203 + for (i = 0; i < hash_slots; i++) 1204 + futex_hash_bucket_init(&fph->queues[i], fph); 1205 + 1206 + mm->futex_phash = fph; 1207 + return 0; 1208 + } 1209 + 1210 + static int futex_hash_get_slots(void) 1211 + { 1212 + struct futex_private_hash *fph; 1213 + 1214 + fph = current->mm->futex_phash; 1215 + if (fph && fph->hash_mask) 1216 + return fph->hash_mask + 1; 1217 + return 0; 1218 + } 1219 + 1220 + #else 1221 + 1222 + static int futex_hash_allocate(unsigned int hash_slots, bool custom) 1223 + { 1224 + return -EINVAL; 1225 + } 1226 + 1227 + static int futex_hash_get_slots(void) 1228 + { 1229 + return 0; 1230 + } 1231 + #endif 1232 + 1233 + int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) 1234 + { 1235 + int ret; 1236 + 1237 + switch (arg2) { 1238 + case PR_FUTEX_HASH_SET_SLOTS: 1239 + if (arg4 != 0) 1240 + return -EINVAL; 1241 + ret = futex_hash_allocate(arg3, true); 1242 + break; 1243 + 1244 + case PR_FUTEX_HASH_GET_SLOTS: 1245 + ret = futex_hash_get_slots(); 1246 + break; 1247 + 1248 + default: 1249 + ret = -EINVAL; 1250 + break; 1251 + } 1252 + return ret; 1255 1253 } 1256 1254 1257 1255 static int __init futex_init(void) ··· 1357 1185 hashsize = 1UL << futex_shift; 1358 1186 1359 1187 for (i = 0; i < hashsize; i++) 1360 - futex_hash_bucket_init(&futex_queues[i]); 1188 + futex_hash_bucket_init(&futex_queues[i], NULL); 1361 1189 1362 1190 futex_hashmask = hashsize - 1; 1363 1191 return 0;
+10
kernel/futex/futex.h
··· 118 118 atomic_t waiters; 119 119 spinlock_t lock; 120 120 struct plist_head chain; 121 + struct futex_private_hash *priv; 121 122 } ____cacheline_aligned_in_smp; 122 123 123 124 /* ··· 205 204 int flags, u64 range_ns); 206 205 207 206 extern struct futex_hash_bucket *futex_hash(union futex_key *key); 207 + #ifdef CONFIG_FUTEX_PRIVATE_HASH 208 208 extern void futex_hash_get(struct futex_hash_bucket *hb); 209 209 extern void futex_hash_put(struct futex_hash_bucket *hb); 210 210 211 211 extern struct futex_private_hash *futex_private_hash(void); 212 212 extern bool futex_private_hash_get(struct futex_private_hash *fph); 213 213 extern void futex_private_hash_put(struct futex_private_hash *fph); 214 + 215 + #else /* !CONFIG_FUTEX_PRIVATE_HASH */ 216 + static inline void futex_hash_get(struct futex_hash_bucket *hb) { } 217 + static inline void futex_hash_put(struct futex_hash_bucket *hb) { } 218 + static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } 219 + static inline bool futex_private_hash_get(void) { return false; } 220 + static inline void futex_private_hash_put(struct futex_private_hash *fph) { } 221 + #endif 214 222 215 223 DEFINE_CLASS(hb, struct futex_hash_bucket *, 216 224 if (_T) futex_hash_put(_T),
+4
kernel/sys.c
··· 52 52 #include <linux/user_namespace.h> 53 53 #include <linux/time_namespace.h> 54 54 #include <linux/binfmts.h> 55 + #include <linux/futex.h> 55 56 56 57 #include <linux/sched.h> 57 58 #include <linux/sched/autogroup.h> ··· 2820 2819 if (arg3 || arg4 || arg5) 2821 2820 return -EINVAL; 2822 2821 error = posixtimer_create_prctl(arg2); 2822 + break; 2823 + case PR_FUTEX_HASH: 2824 + error = futex_hash_prctl(arg2, arg3, arg4); 2823 2825 break; 2824 2826 default: 2825 2827 trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);