execmem: rework execmem_cache_free()

Currently execmem_cache_free() ignores potential allocation failures that
may happen in execmem_cache_add(). Besides, it uses text poking to fill
the memory with trapping instructions before returning it to the cache,
although it would be more efficient to make that memory writable, update
it using memcpy() and then restore the ROX protection.

Rework execmem_cache_free() so that in case of an error it defers
freeing of the memory to a delayed work item.

With this, the happy fast path now changes permissions to RW, fills the
memory with trapping instructions using memcpy(), restores ROX
permissions, adds the memory back to the free cache and clears the
relevant entry in busy_areas.

If any step in the fast path fails, the entry in busy_areas will be marked
as pending_free. Such entries are retried by the delayed work item and
freed asynchronously.

To make the fast path faster, use GFP_KERNEL | __GFP_NORETRY for its
memory allocations and let the asynchronous handler try harder with plain
GFP_KERNEL.

Link: https://lkml.kernel.org/r/20250713071730.4117334-4-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Mike Rapoport (Microsoft) and committed by

Andrew Morton 10 months ago 187fd852 838955f6

+102 -23

1 changed file

expand all

execmem.c

+102 -23

mm/execmem.c

··· 93 93 struct mutex mutex; 94 94 struct maple_tree busy_areas; 95 95 struct maple_tree free_areas; 96 + unsigned int pending_free_cnt; /* protected by mutex */ 96 97 }; 98 + 99 + /* delay to schedule asynchronous free if fast path free fails */ 100 + #define FREE_DELAY (msecs_to_jiffies(10)) 101 + 102 + /* mark entries in busy_areas that should be freed asynchronously */ 103 + #define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1)) 97 104 98 105 static struct execmem_cache execmem_cache = { 99 106 .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), ··· 162 155 163 156 static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); 164 157 165 - static int execmem_cache_add(void *ptr, size_t size) 158 + static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) 166 159 { 167 160 struct maple_tree *free_areas = &execmem_cache.free_areas; 168 - struct mutex *mutex = &execmem_cache.mutex; 169 161 unsigned long addr = (unsigned long)ptr; 170 162 MA_STATE(mas, free_areas, addr - 1, addr + 1); 171 163 unsigned long lower, upper; 172 164 void *area = NULL; 173 - int err; 174 165 175 166 lower = addr; 176 167 upper = addr + size - 1; 177 168 178 - mutex_lock(mutex); 179 169 area = mas_walk(&mas); 180 170 if (area && mas.last == addr - 1) 181 171 lower = mas.index; ··· 182 178 upper = mas.last; 183 179 184 180 mas_set_range(&mas, lower, upper); 185 - err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); 186 - mutex_unlock(mutex); 187 - if (err) 188 - return err; 181 + return mas_store_gfp(&mas, (void *)lower, gfp_mask); 182 + } 189 183 190 - return 0; 184 + static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) 185 + { 186 + guard(mutex)(&execmem_cache.mutex); 187 + 188 + return execmem_cache_add_locked(ptr, size, gfp_mask); 191 189 } 192 190 193 191 static bool within_range(struct execmem_range *range, struct ma_state *mas, ··· 284 278 if (err) 285 279 goto err_free_mem; 286 280 287 - err = execmem_cache_add(p, alloc_size); 281 + err = 
execmem_cache_add(p, alloc_size, GFP_KERNEL); 288 282 if (err) 289 283 goto err_reset_direct_map; 290 284 ··· 313 307 return __execmem_cache_alloc(range, size); 314 308 } 315 309 310 + static inline bool is_pending_free(void *ptr) 311 + { 312 + return ((unsigned long)ptr & PENDING_FREE_MASK); 313 + } 314 + 315 + static inline void *pending_free_set(void *ptr) 316 + { 317 + return (void *)((unsigned long)ptr | PENDING_FREE_MASK); 318 + } 319 + 320 + static inline void *pending_free_clear(void *ptr) 321 + { 322 + return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK); 323 + } 324 + 325 + static int execmem_force_rw(void *ptr, size_t size); 326 + 327 + static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) 328 + { 329 + size_t size = mas_range_len(mas); 330 + int err; 331 + 332 + err = execmem_force_rw(ptr, size); 333 + if (err) 334 + return err; 335 + 336 + execmem_fill_trapping_insns(ptr, size, /* writable = */ true); 337 + execmem_restore_rox(ptr, size); 338 + 339 + err = execmem_cache_add_locked(ptr, size, gfp_mask); 340 + if (err) 341 + return err; 342 + 343 + mas_store_gfp(mas, NULL, gfp_mask); 344 + return 0; 345 + } 346 + 347 + static void execmem_cache_free_slow(struct work_struct *work); 348 + static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow); 349 + 350 + static void execmem_cache_free_slow(struct work_struct *work) 351 + { 352 + struct maple_tree *busy_areas = &execmem_cache.busy_areas; 353 + MA_STATE(mas, busy_areas, 0, ULONG_MAX); 354 + void *area; 355 + 356 + guard(mutex)(&execmem_cache.mutex); 357 + 358 + if (!execmem_cache.pending_free_cnt) 359 + return; 360 + 361 + mas_for_each(&mas, area, ULONG_MAX) { 362 + if (!is_pending_free(area)) 363 + continue; 364 + 365 + area = pending_free_clear(area); 366 + if (__execmem_cache_free(&mas, area, GFP_KERNEL)) 367 + continue; 368 + 369 + execmem_cache.pending_free_cnt--; 370 + } 371 + 372 + if (execmem_cache.pending_free_cnt) 373 + 
schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); 374 + else 375 + schedule_work(&execmem_cache_clean_work); 376 + } 377 + 316 378 static bool execmem_cache_free(void *ptr) 317 379 { 318 380 struct maple_tree *busy_areas = &execmem_cache.busy_areas; 319 - struct mutex *mutex = &execmem_cache.mutex; 320 381 unsigned long addr = (unsigned long)ptr; 321 382 MA_STATE(mas, busy_areas, addr, addr); 322 - size_t size; 323 383 void *area; 384 + int err; 324 385 325 - mutex_lock(mutex); 386 + guard(mutex)(&execmem_cache.mutex); 387 + 326 388 area = mas_walk(&mas); 327 - if (!area) { 328 - mutex_unlock(mutex); 389 + if (!area) 329 390 return false; 391 + 392 + err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY); 393 + if (err) { 394 + /* 395 + * mas points to exact slot we've got the area from, nothing 396 + * else can modify the tree because of the mutex, so there 397 + * won't be any allocations in mas_store_gfp() and it will just 398 + * change the pointer. 399 + */ 400 + area = pending_free_set(area); 401 + mas_store_gfp(&mas, area, GFP_KERNEL); 402 + execmem_cache.pending_free_cnt++; 403 + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); 404 + return true; 330 405 } 331 - size = mas_range_len(&mas); 332 - 333 - mas_store_gfp(&mas, NULL, GFP_KERNEL); 334 - mutex_unlock(mutex); 335 - 336 - execmem_fill_trapping_insns(ptr, size, /* writable = */ false); 337 - 338 - execmem_cache_add(ptr, size); 339 406 340 407 schedule_work(&execmem_cache_clean_work); 341 408

Configure Feed

Configure Feed