Merge tag 'slab-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Pull slab updates from Vlastimil Babka:
"This time it's mostly refactoring and improving APIs for slab users in
the kernel, along with some debugging improvements.

- kmem_cache_create() refactoring (Christian Brauner)

Over the years, kmem_cache_create() has been growing new parameters,
most of which are needed only for a small number of caches - most
recently the rcu_freeptr_offset parameter.

To avoid adding new parameters to kmem_cache_create() and adjusting
all its callers, or creating new wrappers such as
kmem_cache_create_rcu(), we can now pass extra parameters using the
new struct kmem_cache_args. Not explicitly initialized fields
default to values interpreted as unused.

kmem_cache_create() is for now a wrapper that works both with the
new form: kmem_cache_create(name, object_size, args, flags) and the
legacy form: kmem_cache_create(name, object_size, align, flags,
ctor)

- kmem_cache_destroy() waits for kfree_rcu()'s in flight (Vlastimil
Babka, Uladislau Rezki)

Since SLOB removal, kfree() is allowed for freeing objects
allocated by kmem_cache_create(). By extension kfree_rcu() is
allowed as well, which can allow converting simple call_rcu()
callbacks that only do kmem_cache_free(), as there was never a
kmem_cache_free_rcu() variant. However, for caches that can be
destroyed e.g. on module removal, the cache owners knew to issue
rcu_barrier() first to wait for the pending call_rcu()'s, and this
is not sufficient for pending kfree_rcu()'s due to its internal
batching optimizations. Ulad has provided a new
kvfree_rcu_barrier() and to make the usage less error-prone,
kmem_cache_destroy() calls it. Additionally, destroying
SLAB_TYPESAFE_BY_RCU caches now again issues rcu_barrier()
synchronously instead of using an async work, because the past
motivation for async work no longer applies. Users of custom
call_rcu() callbacks should however keep calling rcu_barrier()
before cache destruction.

- Debugging use-after-free in SLAB_TYPESAFE_BY_RCU caches (Jann Horn)

Currently, KASAN cannot catch UAFs in such caches as it is legal to
access them within a grace period, and we only track the grace
period when trying to free the underlying slab page. The new
CONFIG_SLUB_RCU_DEBUG option changes the freeing of individual
objects to be RCU-delayed, after which KASAN can poison them.

- Delayed memcg charging (Shakeel Butt)

In some cases, the memcg is unknown at allocation time, such as when
receiving network packets in softirq context. With
kmem_cache_charge() these may now be charged later, when the user
and its memcg are known.

- Misc fixes and improvements (Pedro Falcato, Axel Rasmussen,
Christoph Lameter, Yan Zhen, Peng Fan, Xavier)"

* tag 'slab-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (34 commits)
mm, slab: restore kerneldoc for kmem_cache_create()
io_uring: port to struct kmem_cache_args
slab: make __kmem_cache_create() static inline
slab: make kmem_cache_create_usercopy() static inline
slab: remove kmem_cache_create_rcu()
file: port to struct kmem_cache_args
slab: create kmem_cache_create() compatibility layer
slab: port KMEM_CACHE_USERCOPY() to struct kmem_cache_args
slab: port KMEM_CACHE() to struct kmem_cache_args
slab: remove rcu_freeptr_offset from struct kmem_cache
slab: pass struct kmem_cache_args to do_kmem_cache_create()
slab: pull kmem_cache_open() into do_kmem_cache_create()
slab: pass struct kmem_cache_args to create_cache()
slab: port kmem_cache_create_usercopy() to struct kmem_cache_args
slab: port kmem_cache_create_rcu() to struct kmem_cache_args
slab: port kmem_cache_create() to struct kmem_cache_args
slab: add struct kmem_cache_args
slab: s/__kmem_cache_create/do_kmem_cache_create/g
memcg: add charging of already allocated slab objects
mm/slab: Optimize the code logic in find_mergeable()
...

Linus Torvalds 2 years ago bdf56c75 efdfcd40

+935 -453

15 changed files

expand all

file_table.c

include

linux

kasan.h

rcutiny.h

rcutree.h

slab.h

io_uring

io_uring.c

kernel

rcu

tree.c

lib

slub_kunit.c

Kconfig.debug

kasan

common.c

kasan_test.c

slab.h

slab_common.c

slub.c

net

ipv4

inet_connection_sock.c

+8 -3

fs/file_table.c

··· 521 521 522 522 void __init files_init(void) 523 523 { 524 - filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file), 525 - offsetof(struct file, f_freeptr), 526 - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); 524 + struct kmem_cache_args args = { 525 + .use_freeptr_offset = true, 526 + .freeptr_offset = offsetof(struct file, f_freeptr), 527 + }; 528 + 529 + filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, 530 + SLAB_HWCACHE_ALIGN | SLAB_PANIC | 531 + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); 527 532 percpu_counter_init(&nr_files, 0, GFP_KERNEL); 528 533 } 529 534

+59 -6

include/linux/kasan.h

··· 175 175 return (void *)object; 176 176 } 177 177 178 - bool __kasan_slab_free(struct kmem_cache *s, void *object, 179 - unsigned long ip, bool init); 180 - static __always_inline bool kasan_slab_free(struct kmem_cache *s, 181 - void *object, bool init) 178 + bool __kasan_slab_pre_free(struct kmem_cache *s, void *object, 179 + unsigned long ip); 180 + /** 181 + * kasan_slab_pre_free - Check whether freeing a slab object is safe. 182 + * @object: Object to be freed. 183 + * 184 + * This function checks whether freeing the given object is safe. It may 185 + * check for double-free and invalid-free bugs and report them. 186 + * 187 + * This function is intended only for use by the slab allocator. 188 + * 189 + * @Return true if freeing the object is unsafe; false otherwise. 190 + */ 191 + static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, 192 + void *object) 182 193 { 183 194 if (kasan_enabled()) 184 - return __kasan_slab_free(s, object, _RET_IP_, init); 195 + return __kasan_slab_pre_free(s, object, _RET_IP_); 196 + return false; 197 + } 198 + 199 + bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, 200 + bool still_accessible); 201 + /** 202 + * kasan_slab_free - Poison, initialize, and quarantine a slab object. 203 + * @object: Object to be freed. 204 + * @init: Whether to initialize the object. 205 + * @still_accessible: Whether the object contents are still accessible. 206 + * 207 + * This function informs that a slab object has been freed and is not 208 + * supposed to be accessed anymore, except when @still_accessible is set 209 + * (indicating that the object is in a SLAB_TYPESAFE_BY_RCU cache and an RCU 210 + * grace period might not have passed yet). 211 + * 212 + * For KASAN modes that have integrated memory initialization 213 + * (kasan_has_integrated_init() == true), this function also initializes 214 + * the object's memory. For other modes, the @init argument is ignored. 
215 + * 216 + * This function might also take ownership of the object to quarantine it. 217 + * When this happens, KASAN will defer freeing the object to a later 218 + * stage and handle it internally until then. The return value indicates 219 + * whether KASAN took ownership of the object. 220 + * 221 + * This function is intended only for use by the slab allocator. 222 + * 223 + * @Return true if KASAN took ownership of the object; false otherwise. 224 + */ 225 + static __always_inline bool kasan_slab_free(struct kmem_cache *s, 226 + void *object, bool init, 227 + bool still_accessible) 228 + { 229 + if (kasan_enabled()) 230 + return __kasan_slab_free(s, object, init, still_accessible); 185 231 return false; 186 232 } 187 233 ··· 417 371 { 418 372 return (void *)object; 419 373 } 420 - static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) 374 + 375 + static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) 376 + { 377 + return false; 378 + } 379 + 380 + static inline bool kasan_slab_free(struct kmem_cache *s, void *object, 381 + bool init, bool still_accessible) 421 382 { 422 383 return false; 423 384 }

include/linux/rcutiny.h

··· 111 111 kvfree(ptr); 112 112 } 113 113 114 + static inline void kvfree_rcu_barrier(void) 115 + { 116 + rcu_barrier(); 117 + } 118 + 114 119 #ifdef CONFIG_KASAN_GENERIC 115 120 void kvfree_call_rcu(struct rcu_head *head, void *ptr); 116 121 #else

include/linux/rcutree.h

··· 35 35 36 36 void synchronize_rcu_expedited(void); 37 37 void kvfree_call_rcu(struct rcu_head *head, void *ptr); 38 + void kvfree_rcu_barrier(void); 38 39 39 40 void rcu_barrier(void); 40 41 void rcu_momentary_eqs(void);

+208 -20

include/linux/slab.h

··· 240 240 */ 241 241 bool slab_is_available(void); 242 242 243 - struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, 244 - unsigned int align, slab_flags_t flags, 245 - void (*ctor)(void *)); 246 - struct kmem_cache *kmem_cache_create_usercopy(const char *name, 247 - unsigned int size, unsigned int align, 248 - slab_flags_t flags, 249 - unsigned int useroffset, unsigned int usersize, 250 - void (*ctor)(void *)); 251 - struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, 252 - unsigned int freeptr_offset, 253 - slab_flags_t flags); 243 + /** 244 + * struct kmem_cache_args - Less common arguments for kmem_cache_create() 245 + * 246 + * Any uninitialized fields of the structure are interpreted as unused. The 247 + * exception is @freeptr_offset where %0 is a valid value, so 248 + * @use_freeptr_offset must be also set to %true in order to interpret the field 249 + * as used. For @useroffset %0 is also valid, but only with non-%0 250 + * @usersize. 251 + * 252 + * When %NULL args is passed to kmem_cache_create(), it is equivalent to all 253 + * fields unused. 254 + */ 255 + struct kmem_cache_args { 256 + /** 257 + * @align: The required alignment for the objects. 258 + * 259 + * %0 means no specific alignment is requested. 260 + */ 261 + unsigned int align; 262 + /** 263 + * @useroffset: Usercopy region offset. 264 + * 265 + * %0 is a valid offset, when @usersize is non-%0 266 + */ 267 + unsigned int useroffset; 268 + /** 269 + * @usersize: Usercopy region size. 270 + * 271 + * %0 means no usercopy region is specified. 272 + */ 273 + unsigned int usersize; 274 + /** 275 + * @freeptr_offset: Custom offset for the free pointer 276 + * in &SLAB_TYPESAFE_BY_RCU caches 277 + * 278 + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer 279 + * outside of the object. This might cause the object to grow in size. 
280 + * Cache creators that have a reason to avoid this can specify a custom 281 + * free pointer offset in their struct where the free pointer will be 282 + * placed. 283 + * 284 + * Note that placing the free pointer inside the object requires the 285 + * caller to ensure that no fields are invalidated that are required to 286 + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for 287 + * details). 288 + * 289 + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset 290 + * is specified, %use_freeptr_offset must be set %true. 291 + * 292 + * Note that @ctor currently isn't supported with custom free pointers 293 + * as a @ctor requires an external free pointer. 294 + */ 295 + unsigned int freeptr_offset; 296 + /** 297 + * @use_freeptr_offset: Whether a @freeptr_offset is used. 298 + */ 299 + bool use_freeptr_offset; 300 + /** 301 + * @ctor: A constructor for the objects. 302 + * 303 + * The constructor is invoked for each object in a newly allocated slab 304 + * page. It is the cache user's responsibility to free object in the 305 + * same state as after calling the constructor, or deal appropriately 306 + * with any differences between a freshly constructed and a reallocated 307 + * object. 308 + * 309 + * %NULL means no constructor. 310 + */ 311 + void (*ctor)(void *); 312 + }; 313 + 314 + struct kmem_cache *__kmem_cache_create_args(const char *name, 315 + unsigned int object_size, 316 + struct kmem_cache_args *args, 317 + slab_flags_t flags); 318 + static inline struct kmem_cache * 319 + __kmem_cache_create(const char *name, unsigned int size, unsigned int align, 320 + slab_flags_t flags, void (*ctor)(void *)) 321 + { 322 + struct kmem_cache_args kmem_args = { 323 + .align = align, 324 + .ctor = ctor, 325 + }; 326 + 327 + return __kmem_cache_create_args(name, size, &kmem_args, flags); 328 + } 329 + 330 + /** 331 + * kmem_cache_create_usercopy - Create a kmem cache with a region suitable 332 + * for copying to userspace. 
333 + * @name: A string which is used in /proc/slabinfo to identify this cache. 334 + * @size: The size of objects to be created in this cache. 335 + * @align: The required alignment for the objects. 336 + * @flags: SLAB flags 337 + * @useroffset: Usercopy region offset 338 + * @usersize: Usercopy region size 339 + * @ctor: A constructor for the objects, or %NULL. 340 + * 341 + * This is a legacy wrapper, new code should use either KMEM_CACHE_USERCOPY() 342 + * if whitelisting a single field is sufficient, or kmem_cache_create() with 343 + * the necessary parameters passed via the args parameter (see 344 + * &struct kmem_cache_args) 345 + * 346 + * Return: a pointer to the cache on success, NULL on failure. 347 + */ 348 + static inline struct kmem_cache * 349 + kmem_cache_create_usercopy(const char *name, unsigned int size, 350 + unsigned int align, slab_flags_t flags, 351 + unsigned int useroffset, unsigned int usersize, 352 + void (*ctor)(void *)) 353 + { 354 + struct kmem_cache_args kmem_args = { 355 + .align = align, 356 + .ctor = ctor, 357 + .useroffset = useroffset, 358 + .usersize = usersize, 359 + }; 360 + 361 + return __kmem_cache_create_args(name, size, &kmem_args, flags); 362 + } 363 + 364 + /* If NULL is passed for @args, use this variant with default arguments. */ 365 + static inline struct kmem_cache * 366 + __kmem_cache_default_args(const char *name, unsigned int size, 367 + struct kmem_cache_args *args, 368 + slab_flags_t flags) 369 + { 370 + struct kmem_cache_args kmem_default_args = {}; 371 + 372 + /* Make sure we don't get passed garbage. */ 373 + if (WARN_ON_ONCE(args)) 374 + return ERR_PTR(-EINVAL); 375 + 376 + return __kmem_cache_create_args(name, size, &kmem_default_args, flags); 377 + } 378 + 379 + /** 380 + * kmem_cache_create - Create a kmem cache. 381 + * @__name: A string which is used in /proc/slabinfo to identify this cache. 382 + * @__object_size: The size of objects to be created in this cache. 
383 + * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL 384 + * means defaults will be used for all the arguments. 385 + * 386 + * This is currently implemented as a macro using ``_Generic()`` to call 387 + * either the new variant of the function, or a legacy one. 388 + * 389 + * The new variant has 4 parameters: 390 + * ``kmem_cache_create(name, object_size, args, flags)`` 391 + * 392 + * See __kmem_cache_create_args() which implements this. 393 + * 394 + * The legacy variant has 5 parameters: 395 + * ``kmem_cache_create(name, object_size, align, flags, ctor)`` 396 + * 397 + * The align and ctor parameters map to the respective fields of 398 + * &struct kmem_cache_args 399 + * 400 + * Context: Cannot be called within a interrupt, but can be interrupted. 401 + * 402 + * Return: a pointer to the cache on success, NULL on failure. 403 + */ 404 + #define kmem_cache_create(__name, __object_size, __args, ...) \ 405 + _Generic((__args), \ 406 + struct kmem_cache_args *: __kmem_cache_create_args, \ 407 + void *: __kmem_cache_default_args, \ 408 + default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) 409 + 254 410 void kmem_cache_destroy(struct kmem_cache *s); 255 411 int kmem_cache_shrink(struct kmem_cache *s); 256 412 ··· 418 262 * f.e. add ____cacheline_aligned_in_smp to the struct declaration 419 263 * then the objects will be properly aligned in SMP configurations. 420 264 */ 421 - #define KMEM_CACHE(__struct, __flags) \ 422 - kmem_cache_create(#__struct, sizeof(struct __struct), \ 423 - __alignof__(struct __struct), (__flags), NULL) 265 + #define KMEM_CACHE(__struct, __flags) \ 266 + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ 267 + &(struct kmem_cache_args) { \ 268 + .align = __alignof__(struct __struct), \ 269 + }, (__flags)) 424 270 425 271 /* 426 272 * To whitelist a single field for copying to/from usercopy, use this 427 273 * macro instead for KMEM_CACHE() above. 
428 274 */ 429 - #define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ 430 - kmem_cache_create_usercopy(#__struct, \ 431 - sizeof(struct __struct), \ 432 - __alignof__(struct __struct), (__flags), \ 433 - offsetof(struct __struct, __field), \ 434 - sizeof_field(struct __struct, __field), NULL) 275 + #define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ 276 + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ 277 + &(struct kmem_cache_args) { \ 278 + .align = __alignof__(struct __struct), \ 279 + .useroffset = offsetof(struct __struct, __field), \ 280 + .usersize = sizeof_field(struct __struct, __field), \ 281 + }, (__flags)) 435 282 436 283 /* 437 284 * Common kmalloc functions provided by all allocators ··· 715 556 gfp_t gfpflags) __assume_slab_alignment __malloc; 716 557 #define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) 717 558 559 + /** 560 + * kmem_cache_charge - memcg charge an already allocated slab memory 561 + * @objp: address of the slab object to memcg charge 562 + * @gfpflags: describe the allocation context 563 + * 564 + * kmem_cache_charge allows charging a slab object to the current memcg, 565 + * primarily in cases where charging at allocation time might not be possible 566 + * because the target memcg is not known (i.e. softirq context) 567 + * 568 + * The objp should be pointer returned by the slab allocator functions like 569 + * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge 570 + * behavior can be controlled through gfpflags parameter, which affects how the 571 + * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes 572 + * that overcharging is requested instead of failure, but is not applied for the 573 + * internal metadata allocation. 574 + * 575 + * There are several cases where it will return true even if the charging was 576 + * not done: 577 + * More specifically: 578 + * 579 + * 1. 
For !CONFIG_MEMCG or cgroup_disable=memory systems. 580 + * 2. Already charged slab objects. 581 + * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc() 582 + * without __GFP_ACCOUNT 583 + * 4. Allocating internal metadata has failed 584 + * 585 + * Return: true if charge was successful otherwise false. 586 + */ 587 + bool kmem_cache_charge(void *objp, gfp_t gfpflags); 718 588 void kmem_cache_free(struct kmem_cache *s, void *objp); 719 589 720 590 kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,

+8 -6

io_uring/io_uring.c

··· 3755 3755 3756 3756 static int __init io_uring_init(void) 3757 3757 { 3758 + struct kmem_cache_args kmem_args = { 3759 + .useroffset = offsetof(struct io_kiocb, cmd.data), 3760 + .usersize = sizeof_field(struct io_kiocb, cmd.data), 3761 + }; 3762 + 3758 3763 #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ 3759 3764 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ 3760 3765 BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ ··· 3844 3839 * range, and HARDENED_USERCOPY will complain if we haven't 3845 3840 * correctly annotated this range. 3846 3841 */ 3847 - req_cachep = kmem_cache_create_usercopy("io_kiocb", 3848 - sizeof(struct io_kiocb), 0, 3849 - SLAB_HWCACHE_ALIGN | SLAB_PANIC | 3850 - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, 3851 - offsetof(struct io_kiocb, cmd.data), 3852 - sizeof_field(struct io_kiocb, cmd.data), NULL); 3842 + req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, 3843 + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | 3844 + SLAB_TYPESAFE_BY_RCU); 3853 3845 io_buf_cachep = KMEM_CACHE(io_buffer, 3854 3846 SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); 3855 3847

+101 -8

kernel/rcu/tree.c

··· 3564 3564 } 3565 3565 3566 3566 /* 3567 - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. 3567 + * Return: %true if a work is queued, %false otherwise. 3568 3568 */ 3569 - static void kfree_rcu_monitor(struct work_struct *work) 3569 + static bool 3570 + kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) 3570 3571 { 3571 - struct kfree_rcu_cpu *krcp = container_of(work, 3572 - struct kfree_rcu_cpu, monitor_work.work); 3573 3572 unsigned long flags; 3573 + bool queued = false; 3574 3574 int i, j; 3575 - 3576 - // Drain ready for reclaim. 3577 - kvfree_rcu_drain_ready(krcp); 3578 3575 3579 3576 raw_spin_lock_irqsave(&krcp->lock, flags); 3580 3577 ··· 3611 3614 // be that the work is in the pending state when 3612 3615 // channels have been detached following by each 3613 3616 // other. 3614 - queue_rcu_work(system_unbound_wq, &krwp->rcu_work); 3617 + queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); 3615 3618 } 3616 3619 } 3617 3620 3618 3621 raw_spin_unlock_irqrestore(&krcp->lock, flags); 3622 + return queued; 3623 + } 3624 + 3625 + /* 3626 + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. 3627 + */ 3628 + static void kfree_rcu_monitor(struct work_struct *work) 3629 + { 3630 + struct kfree_rcu_cpu *krcp = container_of(work, 3631 + struct kfree_rcu_cpu, monitor_work.work); 3632 + 3633 + // Drain ready for reclaim. 3634 + kvfree_rcu_drain_ready(krcp); 3635 + 3636 + // Queue a batch for a rest. 3637 + kvfree_rcu_queue_batch(krcp); 3619 3638 3620 3639 // If there is nothing to detach, it means that our job is 3621 3640 // successfully done here. In case of having at least one ··· 3852 3839 } 3853 3840 } 3854 3841 EXPORT_SYMBOL_GPL(kvfree_call_rcu); 3842 + 3843 + /** 3844 + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. 3845 + * 3846 + * Note that a single argument of kvfree_rcu() call has a slow path that 3847 + * triggers synchronize_rcu() following by freeing a pointer. 
It is done 3848 + * before the return from the function. Therefore for any single-argument 3849 + * call that will result in a kfree() to a cache that is to be destroyed 3850 + * during module exit, it is developer's responsibility to ensure that all 3851 + * such calls have returned before the call to kmem_cache_destroy(). 3852 + */ 3853 + void kvfree_rcu_barrier(void) 3854 + { 3855 + struct kfree_rcu_cpu_work *krwp; 3856 + struct kfree_rcu_cpu *krcp; 3857 + bool queued; 3858 + int i, cpu; 3859 + 3860 + /* 3861 + * Firstly we detach objects and queue them over an RCU-batch 3862 + * for all CPUs. Finally queued works are flushed for each CPU. 3863 + * 3864 + * Please note. If there are outstanding batches for a particular 3865 + * CPU, those have to be finished first following by queuing a new. 3866 + */ 3867 + for_each_possible_cpu(cpu) { 3868 + krcp = per_cpu_ptr(&krc, cpu); 3869 + 3870 + /* 3871 + * Check if this CPU has any objects which have been queued for a 3872 + * new GP completion. If not(means nothing to detach), we are done 3873 + * with it. If any batch is pending/running for this "krcp", below 3874 + * per-cpu flush_rcu_work() waits its completion(see last step). 3875 + */ 3876 + if (!need_offload_krc(krcp)) 3877 + continue; 3878 + 3879 + while (1) { 3880 + /* 3881 + * If we are not able to queue a new RCU work it means: 3882 + * - batches for this CPU are still in flight which should 3883 + * be flushed first and then repeat; 3884 + * - no objects to detach, because of concurrency. 3885 + */ 3886 + queued = kvfree_rcu_queue_batch(krcp); 3887 + 3888 + /* 3889 + * Bail out, if there is no need to offload this "krcp" 3890 + * anymore. As noted earlier it can run concurrently. 3891 + */ 3892 + if (queued || !need_offload_krc(krcp)) 3893 + break; 3894 + 3895 + /* There are ongoing batches. 
*/ 3896 + for (i = 0; i < KFREE_N_BATCHES; i++) { 3897 + krwp = &(krcp->krw_arr[i]); 3898 + flush_rcu_work(&krwp->rcu_work); 3899 + } 3900 + } 3901 + } 3902 + 3903 + /* 3904 + * Now we guarantee that all objects are flushed. 3905 + */ 3906 + for_each_possible_cpu(cpu) { 3907 + krcp = per_cpu_ptr(&krc, cpu); 3908 + 3909 + /* 3910 + * A monitor work can drain ready to reclaim objects 3911 + * directly. Wait its completion if running or pending. 3912 + */ 3913 + cancel_delayed_work_sync(&krcp->monitor_work); 3914 + 3915 + for (i = 0; i < KFREE_N_BATCHES; i++) { 3916 + krwp = &(krcp->krw_arr[i]); 3917 + flush_rcu_work(&krwp->rcu_work); 3918 + } 3919 + } 3920 + } 3921 + EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); 3855 3922 3856 3923 static unsigned long 3857 3924 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)

+31

lib/slub_kunit.c

··· 5 5 #include <linux/slab.h> 6 6 #include <linux/module.h> 7 7 #include <linux/kernel.h> 8 + #include <linux/rcupdate.h> 8 9 #include "../mm/slab.h" 9 10 10 11 static struct kunit_resource resource; ··· 158 157 kmem_cache_destroy(s); 159 158 } 160 159 160 + struct test_kfree_rcu_struct { 161 + struct rcu_head rcu; 162 + }; 163 + 164 + static void test_kfree_rcu(struct kunit *test) 165 + { 166 + struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu", 167 + sizeof(struct test_kfree_rcu_struct), 168 + SLAB_NO_MERGE); 169 + struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL); 170 + 171 + kfree_rcu(p, rcu); 172 + kmem_cache_destroy(s); 173 + 174 + KUNIT_EXPECT_EQ(test, 0, slab_errors); 175 + } 176 + 177 + static void test_leak_destroy(struct kunit *test) 178 + { 179 + struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu", 180 + 64, SLAB_NO_MERGE); 181 + kmem_cache_alloc(s, GFP_KERNEL); 182 + 183 + kmem_cache_destroy(s); 184 + 185 + KUNIT_EXPECT_EQ(test, 1, slab_errors); 186 + } 187 + 161 188 static int test_init(struct kunit *test) 162 189 { 163 190 slab_errors = 0; ··· 206 177 207 178 KUNIT_CASE(test_clobber_redzone_free), 208 179 KUNIT_CASE(test_kmalloc_redzone_access), 180 + KUNIT_CASE(test_kfree_rcu), 181 + KUNIT_CASE(test_leak_destroy), 209 182 {} 210 183 }; 211 184

+32

mm/Kconfig.debug

··· 70 70 off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying 71 71 "slab_debug=-". 72 72 73 + config SLUB_RCU_DEBUG 74 + bool "Enable UAF detection in TYPESAFE_BY_RCU caches (for KASAN)" 75 + depends on SLUB_DEBUG 76 + # SLUB_RCU_DEBUG should build fine without KASAN, but is currently useless 77 + # without KASAN, so mark it as a dependency of KASAN for now. 78 + depends on KASAN 79 + default KASAN_GENERIC || KASAN_SW_TAGS 80 + help 81 + Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache 82 + was not marked as SLAB_TYPESAFE_BY_RCU and every caller used 83 + kfree_rcu() instead. 84 + 85 + This is intended for use in combination with KASAN, to enable KASAN to 86 + detect use-after-free accesses in such caches. 87 + (KFENCE is able to do that independent of this flag.) 88 + 89 + This might degrade performance. 90 + Unfortunately this also prevents a very specific bug pattern from 91 + triggering (insufficient checks against an object being recycled 92 + within the RCU grace period); so this option can be turned off even on 93 + KASAN builds, in case you want to test for such a bug. 94 + 95 + If you're using this for testing bugs / fuzzing and care about 96 + catching all the bugs WAY more than performance, you might want to 97 + also turn on CONFIG_RCU_STRICT_GRACE_PERIOD. 98 + 99 + WARNING: 100 + This is designed as a debugging feature, not a security feature. 101 + Objects are sometimes recycled without RCU delay under memory pressure. 102 + 103 + If unsure, say N. 104 + 73 105 config PAGE_OWNER 74 106 bool "Track page owner" 75 107 depends on DEBUG_KERNEL && STACKTRACE_SUPPORT

+38 -26

mm/kasan/common.c

··· 208 208 return (void *)object; 209 209 } 210 210 211 - static inline bool poison_slab_object(struct kmem_cache *cache, void *object, 212 - unsigned long ip, bool init) 211 + /* Returns true when freeing the object is not safe. */ 212 + static bool check_slab_allocation(struct kmem_cache *cache, void *object, 213 + unsigned long ip) 213 214 { 214 - void *tagged_object; 215 + void *tagged_object = object; 215 216 216 - if (!kasan_arch_is_ready()) 217 - return false; 218 - 219 - tagged_object = object; 220 217 object = kasan_reset_tag(object); 221 218 222 219 if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) { ··· 221 224 return true; 222 225 } 223 226 224 - /* RCU slabs could be legally used after free within the RCU period. */ 225 - if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) 226 - return false; 227 - 228 227 if (!kasan_byte_accessible(tagged_object)) { 229 228 kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE); 230 229 return true; 231 230 } 231 + 232 + return false; 233 + } 234 + 235 + static inline void poison_slab_object(struct kmem_cache *cache, void *object, 236 + bool init, bool still_accessible) 237 + { 238 + void *tagged_object = object; 239 + 240 + object = kasan_reset_tag(object); 241 + 242 + /* RCU slabs could be legally used after free within the RCU period. 
*/ 243 + if (unlikely(still_accessible)) 244 + return; 232 245 233 246 kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), 234 247 KASAN_SLAB_FREE, init); 235 248 236 249 if (kasan_stack_collection_enabled()) 237 250 kasan_save_free_info(cache, tagged_object); 238 - 239 - return false; 240 251 } 241 252 242 - bool __kasan_slab_free(struct kmem_cache *cache, void *object, 243 - unsigned long ip, bool init) 253 + bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, 254 + unsigned long ip) 244 255 { 245 - if (is_kfence_address(object)) 256 + if (!kasan_arch_is_ready() || is_kfence_address(object)) 257 + return false; 258 + return check_slab_allocation(cache, object, ip); 259 + } 260 + 261 + bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, 262 + bool still_accessible) 263 + { 264 + if (!kasan_arch_is_ready() || is_kfence_address(object)) 246 265 return false; 247 266 248 - /* 249 - * If the object is buggy, do not let slab put the object onto the 250 - * freelist. The object will thus never be allocated again and its 251 - * metadata will never get released. 252 - */ 253 - if (poison_slab_object(cache, object, ip, init)) 254 - return true; 267 + poison_slab_object(cache, object, init, still_accessible); 255 268 256 269 /* 257 270 * If the object is put into quarantine, do not let slab put the object ··· 511 504 return true; 512 505 } 513 506 514 - if (is_kfence_address(ptr)) 515 - return false; 507 + if (is_kfence_address(ptr) || !kasan_arch_is_ready()) 508 + return true; 516 509 517 510 slab = folio_slab(folio); 518 - return !poison_slab_object(slab->slab_cache, ptr, ip, false); 511 + 512 + if (check_slab_allocation(slab->slab_cache, ptr, ip)) 513 + return false; 514 + 515 + poison_slab_object(slab->slab_cache, ptr, false, false); 516 + return true; 519 517 } 520 518 521 519 void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip)

+46

mm/kasan/kasan_test.c

··· 996 996 kmem_cache_destroy(cache); 997 997 } 998 998 999 + static void kmem_cache_rcu_uaf(struct kunit *test) 1000 + { 1001 + char *p; 1002 + size_t size = 200; 1003 + struct kmem_cache *cache; 1004 + 1005 + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB_RCU_DEBUG); 1006 + 1007 + cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU, 1008 + NULL); 1009 + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); 1010 + 1011 + p = kmem_cache_alloc(cache, GFP_KERNEL); 1012 + if (!p) { 1013 + kunit_err(test, "Allocation failed: %s\n", __func__); 1014 + kmem_cache_destroy(cache); 1015 + return; 1016 + } 1017 + *p = 1; 1018 + 1019 + rcu_read_lock(); 1020 + 1021 + /* Free the object - this will internally schedule an RCU callback. */ 1022 + kmem_cache_free(cache, p); 1023 + 1024 + /* 1025 + * We should still be allowed to access the object at this point because 1026 + * the cache is SLAB_TYPESAFE_BY_RCU and we've been in an RCU read-side 1027 + * critical section since before the kmem_cache_free(). 1028 + */ 1029 + READ_ONCE(*p); 1030 + 1031 + rcu_read_unlock(); 1032 + 1033 + /* 1034 + * Wait for the RCU callback to execute; after this, the object should 1035 + * have actually been freed from KASAN's perspective. 1036 + */ 1037 + rcu_barrier(); 1038 + 1039 + KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p)); 1040 + 1041 + kmem_cache_destroy(cache); 1042 + } 1043 + 999 1044 static void empty_cache_ctor(void *object) { } 1000 1045 1001 1046 static void kmem_cache_double_destroy(struct kunit *test) ··· 1982 1937 KUNIT_CASE(kmem_cache_oob), 1983 1938 KUNIT_CASE(kmem_cache_double_free), 1984 1939 KUNIT_CASE(kmem_cache_invalid_free), 1940 + KUNIT_CASE(kmem_cache_rcu_uaf), 1985 1941 KUNIT_CASE(kmem_cache_double_destroy), 1986 1942 KUNIT_CASE(kmem_cache_accounted), 1987 1943 KUNIT_CASE(kmem_cache_bulk),

+10 -3

mm/slab.h

··· 261 261 unsigned int object_size; /* Object size without metadata */ 262 262 struct reciprocal_value reciprocal_size; 263 263 unsigned int offset; /* Free pointer offset */ 264 - /* Specific free pointer requested (if not UINT_MAX) */ 265 - unsigned int rcu_freeptr_offset; 266 264 #ifdef CONFIG_SLUB_CPU_PARTIAL 267 265 /* Number of per cpu partial objects to keep around */ 268 266 unsigned int cpu_partial; ··· 422 424 gfp_t kmalloc_fix_flags(gfp_t flags); 423 425 424 426 /* Functions provided by the slab allocators */ 425 - int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); 427 + int do_kmem_cache_create(struct kmem_cache *s, const char *name, 428 + unsigned int size, struct kmem_cache_args *args, 429 + slab_flags_t flags); 426 430 427 431 void __init kmem_cache_init(void); 428 432 extern void create_boot_cache(struct kmem_cache *, const char *name, ··· 443 443 static inline bool is_kmalloc_cache(struct kmem_cache *s) 444 444 { 445 445 return (s->flags & SLAB_KMALLOC); 446 + } 447 + 448 + static inline bool is_kmalloc_normal(struct kmem_cache *s) 449 + { 450 + if (!is_kmalloc_cache(s)) 451 + return false; 452 + return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); 446 453 } 447 454 448 455 /* Legal flag mask for kmem_cache_create(), for various configurations */

+114 -238

mm/slab_common.c

··· 40 40 DEFINE_MUTEX(slab_mutex); 41 41 struct kmem_cache *kmem_cache; 42 42 43 - static LIST_HEAD(slab_caches_to_rcu_destroy); 44 - static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); 45 - static DECLARE_WORK(slab_caches_to_rcu_destroy_work, 46 - slab_caches_to_rcu_destroy_workfn); 47 - 48 43 /* 49 44 * Set of flags that will prevent slab merging 50 45 */ ··· 83 88 EXPORT_SYMBOL(kmem_cache_size); 84 89 85 90 #ifdef CONFIG_DEBUG_VM 91 + 92 + static bool kmem_cache_is_duplicate_name(const char *name) 93 + { 94 + struct kmem_cache *s; 95 + 96 + list_for_each_entry(s, &slab_caches, list) { 97 + if (!strcmp(s->name, name)) 98 + return true; 99 + } 100 + 101 + return false; 102 + } 103 + 86 104 static int kmem_cache_sanity_check(const char *name, unsigned int size) 87 105 { 88 106 if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) { 89 107 pr_err("kmem_cache_create(%s) integrity check failed\n", name); 90 108 return -EINVAL; 91 109 } 110 + 111 + /* Duplicate names will confuse slabtop, et al */ 112 + WARN(kmem_cache_is_duplicate_name(name), 113 + "kmem_cache of name '%s' already exists\n", name); 92 114 93 115 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 94 116 return 0; ··· 181 169 if (ctor) 182 170 return NULL; 183 171 184 - size = ALIGN(size, sizeof(void *)); 185 - align = calculate_alignment(flags, align, size); 186 - size = ALIGN(size, align); 187 172 flags = kmem_cache_flags(flags, name); 188 173 189 174 if (flags & SLAB_NEVER_MERGE) 190 175 return NULL; 176 + 177 + size = ALIGN(size, sizeof(void *)); 178 + align = calculate_alignment(flags, align, size); 179 + size = ALIGN(size, align); 191 180 192 181 list_for_each_entry_reverse(s, &slab_caches, list) { 193 182 if (slab_unmergeable(s)) ··· 215 202 } 216 203 217 204 static struct kmem_cache *create_cache(const char *name, 218 - unsigned int object_size, unsigned int freeptr_offset, 219 - unsigned int align, slab_flags_t flags, 220 - unsigned int useroffset, unsigned int 
usersize, 221 - void (*ctor)(void *)) 205 + unsigned int object_size, 206 + struct kmem_cache_args *args, 207 + slab_flags_t flags) 222 208 { 223 209 struct kmem_cache *s; 224 210 int err; 225 211 226 - if (WARN_ON(useroffset + usersize > object_size)) 227 - useroffset = usersize = 0; 212 + if (WARN_ON(args->useroffset + args->usersize > object_size)) 213 + args->useroffset = args->usersize = 0; 228 214 229 215 /* If a custom freelist pointer is requested make sure it's sane. */ 230 216 err = -EINVAL; 231 - if (freeptr_offset != UINT_MAX && 232 - (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || 233 - !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)))) 217 + if (args->use_freeptr_offset && 218 + (args->freeptr_offset >= object_size || 219 + !(flags & SLAB_TYPESAFE_BY_RCU) || 220 + !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)))) 234 221 goto out; 235 222 236 223 err = -ENOMEM; 237 224 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 238 225 if (!s) 239 226 goto out; 240 - 241 - s->name = name; 242 - s->size = s->object_size = object_size; 243 - s->rcu_freeptr_offset = freeptr_offset; 244 - s->align = align; 245 - s->ctor = ctor; 246 - #ifdef CONFIG_HARDENED_USERCOPY 247 - s->useroffset = useroffset; 248 - s->usersize = usersize; 249 - #endif 250 - err = __kmem_cache_create(s, flags); 227 + err = do_kmem_cache_create(s, name, object_size, args, flags); 251 228 if (err) 252 229 goto out_free_cache; 253 230 ··· 251 248 return ERR_PTR(err); 252 249 } 253 250 254 - static struct kmem_cache * 255 - do_kmem_cache_create_usercopy(const char *name, 256 - unsigned int size, unsigned int freeptr_offset, 257 - unsigned int align, slab_flags_t flags, 258 - unsigned int useroffset, unsigned int usersize, 259 - void (*ctor)(void *)) 251 + /** 252 + * __kmem_cache_create_args - Create a kmem cache. 253 + * @name: A string which is used in /proc/slabinfo to identify this cache. 254 + * @object_size: The size of objects to be created in this cache. 
255 + * @args: Additional arguments for the cache creation (see 256 + * &struct kmem_cache_args). 257 + * @flags: See %SLAB_* flags for an explanation of individual @flags. 258 + * 259 + * Not to be called directly, use the kmem_cache_create() wrapper with the same 260 + * parameters. 261 + * 262 + * Context: Cannot be called within a interrupt, but can be interrupted. 263 + * 264 + * Return: a pointer to the cache on success, NULL on failure. 265 + */ 266 + struct kmem_cache *__kmem_cache_create_args(const char *name, 267 + unsigned int object_size, 268 + struct kmem_cache_args *args, 269 + slab_flags_t flags) 260 270 { 261 271 struct kmem_cache *s = NULL; 262 272 const char *cache_name; ··· 291 275 292 276 mutex_lock(&slab_mutex); 293 277 294 - err = kmem_cache_sanity_check(name, size); 278 + err = kmem_cache_sanity_check(name, object_size); 295 279 if (err) { 296 280 goto out_unlock; 297 281 } ··· 312 296 313 297 /* Fail closed on bad usersize of useroffset values. */ 314 298 if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || 315 - WARN_ON(!usersize && useroffset) || 316 - WARN_ON(size < usersize || size - usersize < useroffset)) 317 - usersize = useroffset = 0; 299 + WARN_ON(!args->usersize && args->useroffset) || 300 + WARN_ON(object_size < args->usersize || 301 + object_size - args->usersize < args->useroffset)) 302 + args->usersize = args->useroffset = 0; 318 303 319 - if (!usersize) 320 - s = __kmem_cache_alias(name, size, align, flags, ctor); 304 + if (!args->usersize) 305 + s = __kmem_cache_alias(name, object_size, args->align, flags, 306 + args->ctor); 321 307 if (s) 322 308 goto out_unlock; 323 309 ··· 329 311 goto out_unlock; 330 312 } 331 313 332 - s = create_cache(cache_name, size, freeptr_offset, 333 - calculate_alignment(flags, align, size), 334 - flags, useroffset, usersize, ctor); 314 + args->align = calculate_alignment(flags, args->align, object_size); 315 + s = create_cache(cache_name, object_size, args, flags); 335 316 if (IS_ERR(s)) { 336 317 err = 
PTR_ERR(s); 337 318 kfree_const(cache_name); ··· 352 335 } 353 336 return s; 354 337 } 355 - 356 - /** 357 - * kmem_cache_create_usercopy - Create a cache with a region suitable 358 - * for copying to userspace 359 - * @name: A string which is used in /proc/slabinfo to identify this cache. 360 - * @size: The size of objects to be created in this cache. 361 - * @align: The required alignment for the objects. 362 - * @flags: SLAB flags 363 - * @useroffset: Usercopy region offset 364 - * @usersize: Usercopy region size 365 - * @ctor: A constructor for the objects. 366 - * 367 - * Cannot be called within a interrupt, but can be interrupted. 368 - * The @ctor is run when new pages are allocated by the cache. 369 - * 370 - * The flags are 371 - * 372 - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 373 - * to catch references to uninitialised memory. 374 - * 375 - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check 376 - * for buffer overruns. 377 - * 378 - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 379 - * cacheline. This can be beneficial if you're counting cycles as closely 380 - * as davem. 381 - * 382 - * Return: a pointer to the cache on success, NULL on failure. 383 - */ 384 - struct kmem_cache * 385 - kmem_cache_create_usercopy(const char *name, unsigned int size, 386 - unsigned int align, slab_flags_t flags, 387 - unsigned int useroffset, unsigned int usersize, 388 - void (*ctor)(void *)) 389 - { 390 - return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, 391 - useroffset, usersize, ctor); 392 - } 393 - EXPORT_SYMBOL(kmem_cache_create_usercopy); 394 - 395 - /** 396 - * kmem_cache_create - Create a cache. 397 - * @name: A string which is used in /proc/slabinfo to identify this cache. 398 - * @size: The size of objects to be created in this cache. 399 - * @align: The required alignment for the objects. 400 - * @flags: SLAB flags 401 - * @ctor: A constructor for the objects. 
402 - * 403 - * Cannot be called within a interrupt, but can be interrupted. 404 - * The @ctor is run when new pages are allocated by the cache. 405 - * 406 - * The flags are 407 - * 408 - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 409 - * to catch references to uninitialised memory. 410 - * 411 - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check 412 - * for buffer overruns. 413 - * 414 - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 415 - * cacheline. This can be beneficial if you're counting cycles as closely 416 - * as davem. 417 - * 418 - * Return: a pointer to the cache on success, NULL on failure. 419 - */ 420 - struct kmem_cache * 421 - kmem_cache_create(const char *name, unsigned int size, unsigned int align, 422 - slab_flags_t flags, void (*ctor)(void *)) 423 - { 424 - return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, 425 - 0, 0, ctor); 426 - } 427 - EXPORT_SYMBOL(kmem_cache_create); 428 - 429 - /** 430 - * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. 431 - * @name: A string which is used in /proc/slabinfo to identify this cache. 432 - * @size: The size of objects to be created in this cache. 433 - * @freeptr_offset: The offset into the memory to the free pointer 434 - * @flags: SLAB flags 435 - * 436 - * Cannot be called within an interrupt, but can be interrupted. 437 - * 438 - * See kmem_cache_create() for an explanation of possible @flags. 439 - * 440 - * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside 441 - * of the object. This might cause the object to grow in size. Callers 442 - * that have a reason to avoid this can specify a custom free pointer 443 - * offset in their struct where the free pointer will be placed. 
444 - * 445 - * Note that placing the free pointer inside the object requires the 446 - * caller to ensure that no fields are invalidated that are required to 447 - * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for 448 - * details.). 449 - * 450 - * Using zero as a value for @freeptr_offset is valid. To request no 451 - * offset UINT_MAX must be specified. 452 - * 453 - * Note that @ctor isn't supported with custom free pointers as a @ctor 454 - * requires an external free pointer. 455 - * 456 - * Return: a pointer to the cache on success, NULL on failure. 457 - */ 458 - struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, 459 - unsigned int freeptr_offset, 460 - slab_flags_t flags) 461 - { 462 - return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0, 463 - flags | SLAB_TYPESAFE_BY_RCU, 0, 0, 464 - NULL); 465 - } 466 - EXPORT_SYMBOL(kmem_cache_create_rcu); 338 + EXPORT_SYMBOL(__kmem_cache_create_args); 467 339 468 340 static struct kmem_cache *kmem_buckets_cache __ro_after_init; 469 341 ··· 440 534 fail: 441 535 for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) 442 536 kmem_cache_destroy((*b)[idx]); 443 - kfree(b); 537 + kmem_cache_free(kmem_buckets_cache, b); 444 538 445 539 return NULL; 446 540 } 447 541 EXPORT_SYMBOL(kmem_buckets_create); 448 542 449 - #ifdef SLAB_SUPPORTS_SYSFS 450 543 /* 451 544 * For a given kmem_cache, kmem_cache_destroy() should only be called 452 545 * once or there will be a use-after-free problem. The actual deletion 453 546 * and release of the kobject does not need slab_mutex or cpu_hotplug_lock 454 547 * protection. So they are now done without holding those locks. 455 - * 456 - * Note that there will be a slight delay in the deletion of sysfs files 457 - * if kmem_cache_release() is called indrectly from a work function. 
458 548 */ 459 549 static void kmem_cache_release(struct kmem_cache *s) 460 550 { 461 - if (slab_state >= FULL) { 462 - sysfs_slab_unlink(s); 551 + kfence_shutdown_cache(s); 552 + if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL) 463 553 sysfs_slab_release(s); 464 - } else { 554 + else 465 555 slab_kmem_cache_release(s); 466 - } 467 - } 468 - #else 469 - static void kmem_cache_release(struct kmem_cache *s) 470 - { 471 - slab_kmem_cache_release(s); 472 - } 473 - #endif 474 - 475 - static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) 476 - { 477 - LIST_HEAD(to_destroy); 478 - struct kmem_cache *s, *s2; 479 - 480 - /* 481 - * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the 482 - * @slab_caches_to_rcu_destroy list. The slab pages are freed 483 - * through RCU and the associated kmem_cache are dereferenced 484 - * while freeing the pages, so the kmem_caches should be freed only 485 - * after the pending RCU operations are finished. As rcu_barrier() 486 - * is a pretty slow operation, we batch all pending destructions 487 - * asynchronously. 
488 - */ 489 - mutex_lock(&slab_mutex); 490 - list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy); 491 - mutex_unlock(&slab_mutex); 492 - 493 - if (list_empty(&to_destroy)) 494 - return; 495 - 496 - rcu_barrier(); 497 - 498 - list_for_each_entry_safe(s, s2, &to_destroy, list) { 499 - debugfs_slab_release(s); 500 - kfence_shutdown_cache(s); 501 - kmem_cache_release(s); 502 - } 503 - } 504 - 505 - static int shutdown_cache(struct kmem_cache *s) 506 - { 507 - /* free asan quarantined objects */ 508 - kasan_cache_shutdown(s); 509 - 510 - if (__kmem_cache_shutdown(s) != 0) 511 - return -EBUSY; 512 - 513 - list_del(&s->list); 514 - 515 - if (s->flags & SLAB_TYPESAFE_BY_RCU) { 516 - list_add_tail(&s->list, &slab_caches_to_rcu_destroy); 517 - schedule_work(&slab_caches_to_rcu_destroy_work); 518 - } else { 519 - kfence_shutdown_cache(s); 520 - debugfs_slab_release(s); 521 - } 522 - 523 - return 0; 524 556 } 525 557 526 558 void slab_kmem_cache_release(struct kmem_cache *s) ··· 470 626 471 627 void kmem_cache_destroy(struct kmem_cache *s) 472 628 { 473 - int err = -EBUSY; 474 - bool rcu_set; 629 + int err; 475 630 476 631 if (unlikely(!s) || !kasan_check_byte(s)) 477 632 return; 478 633 634 + /* in-flight kfree_rcu()'s may include objects from our cache */ 635 + kvfree_rcu_barrier(); 636 + 637 + if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) && 638 + (s->flags & SLAB_TYPESAFE_BY_RCU)) { 639 + /* 640 + * Under CONFIG_SLUB_RCU_DEBUG, when objects in a 641 + * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally 642 + * defer their freeing with call_rcu(). 643 + * Wait for such call_rcu() invocations here before actually 644 + * destroying the cache. 645 + * 646 + * It doesn't matter that we haven't looked at the slab refcount 647 + * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so 648 + * the refcount should be 1 here. 
649 + */ 650 + rcu_barrier(); 651 + } 652 + 479 653 cpus_read_lock(); 480 654 mutex_lock(&slab_mutex); 481 655 482 - rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; 483 - 484 656 s->refcount--; 485 - if (s->refcount) 486 - goto out_unlock; 657 + if (s->refcount) { 658 + mutex_unlock(&slab_mutex); 659 + cpus_read_unlock(); 660 + return; 661 + } 487 662 488 - err = shutdown_cache(s); 663 + /* free asan quarantined objects */ 664 + kasan_cache_shutdown(s); 665 + 666 + err = __kmem_cache_shutdown(s); 489 667 WARN(err, "%s %s: Slab cache still has objects when called from %pS", 490 668 __func__, s->name, (void *)_RET_IP_); 491 - out_unlock: 669 + 670 + list_del(&s->list); 671 + 492 672 mutex_unlock(&slab_mutex); 493 673 cpus_read_unlock(); 494 - if (!err && !rcu_set) 495 - kmem_cache_release(s); 674 + 675 + if (slab_state >= FULL) 676 + sysfs_slab_unlink(s); 677 + debugfs_slab_release(s); 678 + 679 + if (err) 680 + return; 681 + 682 + if (s->flags & SLAB_TYPESAFE_BY_RCU) 683 + rcu_barrier(); 684 + 685 + kmem_cache_release(s); 496 686 } 497 687 EXPORT_SYMBOL(kmem_cache_destroy); 498 688 ··· 638 760 { 639 761 int err; 640 762 unsigned int align = ARCH_KMALLOC_MINALIGN; 641 - 642 - s->name = name; 643 - s->size = s->object_size = size; 763 + struct kmem_cache_args kmem_args = {}; 644 764 645 765 /* 646 766 * kmalloc caches guarantee alignment of at least the largest ··· 647 771 */ 648 772 if (flags & SLAB_KMALLOC) 649 773 align = max(align, 1U << (ffs(size) - 1)); 650 - s->align = calculate_alignment(flags, align, size); 774 + kmem_args.align = calculate_alignment(flags, align, size); 651 775 652 776 #ifdef CONFIG_HARDENED_USERCOPY 653 - s->useroffset = useroffset; 654 - s->usersize = usersize; 777 + kmem_args.useroffset = useroffset; 778 + kmem_args.usersize = usersize; 655 779 #endif 656 780 657 - err = __kmem_cache_create(s, flags); 781 + err = do_kmem_cache_create(s, name, size, &kmem_args, flags); 658 782 659 783 if (err) 660 784 panic("Creation of kmalloc slab %s size=%u 
failed. Reason %d\n",

+271 -141

mm/slub.c

··· 750 750 return false; 751 751 } 752 752 753 + /* 754 + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API 755 + * family will round up the real request size to these fixed ones, so 756 + * there could be an extra area than what is requested. Save the original 757 + * request size in the meta data area, for better debug and sanity check. 758 + */ 759 + static inline void set_orig_size(struct kmem_cache *s, 760 + void *object, unsigned int orig_size) 761 + { 762 + void *p = kasan_reset_tag(object); 763 + unsigned int kasan_meta_size; 764 + 765 + if (!slub_debug_orig_size(s)) 766 + return; 767 + 768 + /* 769 + * KASAN can save its free meta data inside of the object at offset 0. 770 + * If this meta data size is larger than 'orig_size', it will overlap 771 + * the data redzone in [orig_size+1, object_size]. Thus, we adjust 772 + * 'orig_size' to be as at least as big as KASAN's meta data. 773 + */ 774 + kasan_meta_size = kasan_metadata_size(s, true); 775 + if (kasan_meta_size > orig_size) 776 + orig_size = kasan_meta_size; 777 + 778 + p += get_info_end(s); 779 + p += sizeof(struct track) * 2; 780 + 781 + *(unsigned int *)p = orig_size; 782 + } 783 + 784 + static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) 785 + { 786 + void *p = kasan_reset_tag(object); 787 + 788 + if (!slub_debug_orig_size(s)) 789 + return s->object_size; 790 + 791 + p += get_info_end(s); 792 + p += sizeof(struct track) * 2; 793 + 794 + return *(unsigned int *)p; 795 + } 796 + 753 797 #ifdef CONFIG_SLUB_DEBUG 754 798 static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; 755 799 static DEFINE_SPINLOCK(object_map_lock); ··· 1021 977 pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", 1022 978 slab, slab->objects, slab->inuse, slab->freelist, 1023 979 &slab->__page_flags); 1024 - } 1025 - 1026 - /* 1027 - * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API 1028 - * family will round up the real request size to 
these fixed ones, so 1029 - * there could be an extra area than what is requested. Save the original 1030 - * request size in the meta data area, for better debug and sanity check. 1031 - */ 1032 - static inline void set_orig_size(struct kmem_cache *s, 1033 - void *object, unsigned int orig_size) 1034 - { 1035 - void *p = kasan_reset_tag(object); 1036 - unsigned int kasan_meta_size; 1037 - 1038 - if (!slub_debug_orig_size(s)) 1039 - return; 1040 - 1041 - /* 1042 - * KASAN can save its free meta data inside of the object at offset 0. 1043 - * If this meta data size is larger than 'orig_size', it will overlap 1044 - * the data redzone in [orig_size+1, object_size]. Thus, we adjust 1045 - * 'orig_size' to be as at least as big as KASAN's meta data. 1046 - */ 1047 - kasan_meta_size = kasan_metadata_size(s, true); 1048 - if (kasan_meta_size > orig_size) 1049 - orig_size = kasan_meta_size; 1050 - 1051 - p += get_info_end(s); 1052 - p += sizeof(struct track) * 2; 1053 - 1054 - *(unsigned int *)p = orig_size; 1055 - } 1056 - 1057 - static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) 1058 - { 1059 - void *p = kasan_reset_tag(object); 1060 - 1061 - if (!slub_debug_orig_size(s)) 1062 - return s->object_size; 1063 - 1064 - p += get_info_end(s); 1065 - p += sizeof(struct track) * 2; 1066 - 1067 - return *(unsigned int *)p; 1068 980 } 1069 981 1070 982 void skip_orig_size_check(struct kmem_cache *s, const void *object) ··· 1888 1888 int objects) {} 1889 1889 static inline void dec_slabs_node(struct kmem_cache *s, int node, 1890 1890 int objects) {} 1891 - 1892 1891 #ifndef CONFIG_SLUB_TINY 1893 1892 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 1894 1893 void **freelist, void *nextfree) ··· 2182 2183 2183 2184 __memcg_slab_free_hook(s, slab, p, objects, obj_exts); 2184 2185 } 2186 + 2187 + static __fastpath_inline 2188 + bool memcg_slab_post_charge(void *p, gfp_t flags) 2189 + { 2190 + struct slabobj_ext *slab_exts; 2191 + 
struct kmem_cache *s; 2192 + struct folio *folio; 2193 + struct slab *slab; 2194 + unsigned long off; 2195 + 2196 + folio = virt_to_folio(p); 2197 + if (!folio_test_slab(folio)) { 2198 + return folio_memcg_kmem(folio) || 2199 + (__memcg_kmem_charge_page(folio_page(folio, 0), flags, 2200 + folio_order(folio)) == 0); 2201 + } 2202 + 2203 + slab = folio_slab(folio); 2204 + s = slab->slab_cache; 2205 + 2206 + /* 2207 + * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency 2208 + * of slab_obj_exts being allocated from the same slab and thus the slab 2209 + * becoming effectively unfreeable. 2210 + */ 2211 + if (is_kmalloc_normal(s)) 2212 + return true; 2213 + 2214 + /* Ignore already charged objects. */ 2215 + slab_exts = slab_obj_exts(slab); 2216 + if (slab_exts) { 2217 + off = obj_to_index(s, slab, p); 2218 + if (unlikely(slab_exts[off].objcg)) 2219 + return true; 2220 + } 2221 + 2222 + return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); 2223 + } 2224 + 2185 2225 #else /* CONFIG_MEMCG */ 2186 2226 static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, 2187 2227 struct list_lru *lru, ··· 2234 2196 void **p, int objects) 2235 2197 { 2236 2198 } 2199 + 2200 + static inline bool memcg_slab_post_charge(void *p, gfp_t flags) 2201 + { 2202 + return true; 2203 + } 2237 2204 #endif /* CONFIG_MEMCG */ 2205 + 2206 + #ifdef CONFIG_SLUB_RCU_DEBUG 2207 + static void slab_free_after_rcu_debug(struct rcu_head *rcu_head); 2208 + 2209 + struct rcu_delayed_free { 2210 + struct rcu_head head; 2211 + void *object; 2212 + }; 2213 + #endif 2238 2214 2239 2215 /* 2240 2216 * Hooks for other subsystems that check memory allocations. In a typical 2241 2217 * production configuration these hooks all should produce no code at all. 2242 2218 * 2243 2219 * Returns true if freeing of the object can proceed, false if its reuse 2244 - * was delayed by KASAN quarantine, or it was returned to KFENCE. 
2220 + * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned 2221 + * to KFENCE. 2245 2222 */ 2246 2223 static __always_inline 2247 - bool slab_free_hook(struct kmem_cache *s, void *x, bool init) 2224 + bool slab_free_hook(struct kmem_cache *s, void *x, bool init, 2225 + bool after_rcu_delay) 2248 2226 { 2227 + /* Are the object contents still accessible? */ 2228 + bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; 2229 + 2249 2230 kmemleak_free_recursive(x, s->flags); 2250 2231 kmsan_slab_free(s, x); 2251 2232 ··· 2274 2217 debug_check_no_obj_freed(x, s->object_size); 2275 2218 2276 2219 /* Use KCSAN to help debug racy use-after-free. */ 2277 - if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) 2220 + if (!still_accessible) 2278 2221 __kcsan_check_access(x, s->object_size, 2279 2222 KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); 2280 2223 2281 2224 if (kfence_free(x)) 2282 2225 return false; 2226 + 2227 + /* 2228 + * Give KASAN a chance to notice an invalid free operation before we 2229 + * modify the object. 2230 + */ 2231 + if (kasan_slab_pre_free(s, x)) 2232 + return false; 2233 + 2234 + #ifdef CONFIG_SLUB_RCU_DEBUG 2235 + if (still_accessible) { 2236 + struct rcu_delayed_free *delayed_free; 2237 + 2238 + delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT); 2239 + if (delayed_free) { 2240 + /* 2241 + * Let KASAN track our call stack as a "related work 2242 + * creation", just like if the object had been freed 2243 + * normally via kfree_rcu(). 2244 + * We have to do this manually because the rcu_head is 2245 + * not located inside the object. 
2246 + */ 2247 + kasan_record_aux_stack_noalloc(x); 2248 + 2249 + delayed_free->object = x; 2250 + call_rcu(&delayed_free->head, slab_free_after_rcu_debug); 2251 + return false; 2252 + } 2253 + } 2254 + #endif /* CONFIG_SLUB_RCU_DEBUG */ 2283 2255 2284 2256 /* 2285 2257 * As memory initialization might be integrated into KASAN, ··· 2323 2237 */ 2324 2238 if (unlikely(init)) { 2325 2239 int rsize; 2326 - unsigned int inuse; 2240 + unsigned int inuse, orig_size; 2327 2241 2328 2242 inuse = get_info_end(s); 2243 + orig_size = get_orig_size(s, x); 2329 2244 if (!kasan_has_integrated_init()) 2330 - memset(kasan_reset_tag(x), 0, s->object_size); 2245 + memset(kasan_reset_tag(x), 0, orig_size); 2331 2246 rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; 2332 2247 memset((char *)kasan_reset_tag(x) + inuse, 0, 2333 2248 s->size - inuse - rsize); 2249 + /* 2250 + * Restore orig_size, otherwise kmalloc redzone overwritten 2251 + * would be reported 2252 + */ 2253 + set_orig_size(s, x, orig_size); 2254 + 2334 2255 } 2335 2256 /* KASAN might put x into memory quarantine, delaying its reuse. 
*/ 2336 - return !kasan_slab_free(s, x, init); 2257 + return !kasan_slab_free(s, x, init, still_accessible); 2337 2258 } 2338 2259 2339 2260 static __fastpath_inline ··· 2354 2261 bool init; 2355 2262 2356 2263 if (is_kfence_address(next)) { 2357 - slab_free_hook(s, next, false); 2264 + slab_free_hook(s, next, false, false); 2358 2265 return false; 2359 2266 } 2360 2267 ··· 2369 2276 next = get_freepointer(s, object); 2370 2277 2371 2278 /* If object's reuse doesn't have to be delayed */ 2372 - if (likely(slab_free_hook(s, object, init))) { 2279 + if (likely(slab_free_hook(s, object, init, false))) { 2373 2280 /* Move object to the new freelist */ 2374 2281 set_freepointer(s, object, *head); 2375 2282 *head = object; ··· 2409 2316 struct slab *slab; 2410 2317 unsigned int order = oo_order(oo); 2411 2318 2412 - folio = (struct folio *)alloc_pages_node(node, flags, order); 2319 + if (node == NUMA_NO_NODE) 2320 + folio = (struct folio *)alloc_pages(flags, order); 2321 + else 2322 + folio = (struct folio *)__alloc_pages_node(node, flags, order); 2323 + 2413 2324 if (!folio) 2414 2325 return NULL; 2415 2326 ··· 3511 3414 { 3512 3415 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 3513 3416 DEFAULT_RATELIMIT_BURST); 3417 + int cpu = raw_smp_processor_id(); 3514 3418 int node; 3515 3419 struct kmem_cache_node *n; 3516 3420 3517 3421 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) 3518 3422 return; 3519 3423 3520 - pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", 3521 - nid, gfpflags, &gfpflags); 3424 + pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n", 3425 + cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags); 3522 3426 pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", 3523 3427 s->name, s->object_size, s->size, oo_order(s->oo), 3524 3428 oo_order(s->min)); ··· 4018 3920 * If the object has been wiped upon free, make sure it's fully 
initialized by 4019 3921 * zeroing out freelist pointer. 4020 3922 * 4021 - * Note that we also wipe custom freelist pointers specified via 4022 - * s->rcu_freeptr_offset. 3923 + * Note that we also wipe custom freelist pointers. 4023 3924 */ 4024 3925 static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, 4025 3926 void *obj) ··· 4159 4062 return ret; 4160 4063 } 4161 4064 EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); 4065 + 4066 + bool kmem_cache_charge(void *objp, gfp_t gfpflags) 4067 + { 4068 + if (!memcg_kmem_online()) 4069 + return true; 4070 + 4071 + return memcg_slab_post_charge(objp, gfpflags); 4072 + } 4073 + EXPORT_SYMBOL(kmem_cache_charge); 4162 4074 4163 4075 /** 4164 4076 * kmem_cache_alloc_node - Allocate an object on the specified node ··· 4577 4471 memcg_slab_free_hook(s, slab, &object, 1); 4578 4472 alloc_tagging_slab_free_hook(s, slab, &object, 1); 4579 4473 4580 - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) 4474 + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) 4581 4475 do_slab_free(s, slab, object, object, 1, addr); 4582 4476 } 4583 4477 ··· 4586 4480 static noinline 4587 4481 void memcg_alloc_abort_single(struct kmem_cache *s, void *object) 4588 4482 { 4589 - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) 4483 + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) 4590 4484 do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); 4591 4485 } 4592 4486 #endif ··· 4604 4498 if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) 4605 4499 do_slab_free(s, slab, head, tail, cnt, addr); 4606 4500 } 4501 + 4502 + #ifdef CONFIG_SLUB_RCU_DEBUG 4503 + static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) 4504 + { 4505 + struct rcu_delayed_free *delayed_free = 4506 + container_of(rcu_head, struct rcu_delayed_free, head); 4507 + void *object = delayed_free->object; 4508 + struct slab *slab = virt_to_slab(object); 4509 + struct 
kmem_cache *s; 4510 + 4511 + kfree(delayed_free); 4512 + 4513 + if (WARN_ON(is_kfence_address(object))) 4514 + return; 4515 + 4516 + /* find the object and the cache again */ 4517 + if (WARN_ON(!slab)) 4518 + return; 4519 + s = slab->slab_cache; 4520 + if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU))) 4521 + return; 4522 + 4523 + /* resume freeing */ 4524 + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) 4525 + do_slab_free(s, slab, object, object, 1, _THIS_IP_); 4526 + } 4527 + #endif /* CONFIG_SLUB_RCU_DEBUG */ 4607 4528 4608 4529 #ifdef CONFIG_KASAN_GENERIC 4609 4530 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) ··· 5278 5145 #endif 5279 5146 } 5280 5147 5281 - /* Was a valid freeptr offset requested? */ 5282 - static inline bool has_freeptr_offset(const struct kmem_cache *s) 5283 - { 5284 - return s->rcu_freeptr_offset != UINT_MAX; 5285 - } 5286 - 5287 5148 /* 5288 5149 * calculate_sizes() determines the order and the distribution of data within 5289 5150 * a slab object. 
5290 5151 */ 5291 - static int calculate_sizes(struct kmem_cache *s) 5152 + static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) 5292 5153 { 5293 5154 slab_flags_t flags = s->flags; 5294 5155 unsigned int size = s->object_size; ··· 5323 5196 */ 5324 5197 s->inuse = size; 5325 5198 5326 - if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || 5199 + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || 5327 5200 (flags & SLAB_POISON) || s->ctor || 5328 5201 ((flags & SLAB_RED_ZONE) && 5329 5202 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { ··· 5345 5218 */ 5346 5219 s->offset = size; 5347 5220 size += sizeof(void *); 5348 - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { 5349 - s->offset = s->rcu_freeptr_offset; 5221 + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { 5222 + s->offset = args->freeptr_offset; 5350 5223 } else { 5351 5224 /* 5352 5225 * Store freelist pointer near middle of object to keep ··· 5419 5292 s->min = oo_make(get_order(size), size); 5420 5293 5421 5294 return !!oo_objects(s->oo); 5422 - } 5423 - 5424 - static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) 5425 - { 5426 - s->flags = kmem_cache_flags(flags, s->name); 5427 - #ifdef CONFIG_SLAB_FREELIST_HARDENED 5428 - s->random = get_random_long(); 5429 - #endif 5430 - 5431 - if (!calculate_sizes(s)) 5432 - goto error; 5433 - if (disable_higher_order_debug) { 5434 - /* 5435 - * Disable debugging flags that store metadata if the min slab 5436 - * order increased. 
5437 - */ 5438 - if (get_order(s->size) > get_order(s->object_size)) { 5439 - s->flags &= ~DEBUG_METADATA_FLAGS; 5440 - s->offset = 0; 5441 - if (!calculate_sizes(s)) 5442 - goto error; 5443 - } 5444 - } 5445 - 5446 - #ifdef system_has_freelist_aba 5447 - if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { 5448 - /* Enable fast mode */ 5449 - s->flags |= __CMPXCHG_DOUBLE; 5450 - } 5451 - #endif 5452 - 5453 - /* 5454 - * The larger the object size is, the more slabs we want on the partial 5455 - * list to avoid pounding the page allocator excessively. 5456 - */ 5457 - s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); 5458 - s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); 5459 - 5460 - set_cpu_partial(s); 5461 - 5462 - #ifdef CONFIG_NUMA 5463 - s->remote_node_defrag_ratio = 1000; 5464 - #endif 5465 - 5466 - /* Initialize the pre-computed randomized freelist if slab is up */ 5467 - if (slab_state >= UP) { 5468 - if (init_cache_random_seq(s)) 5469 - goto error; 5470 - } 5471 - 5472 - if (!init_kmem_cache_nodes(s)) 5473 - goto error; 5474 - 5475 - if (alloc_kmem_cache_cpus(s)) 5476 - return 0; 5477 - 5478 - error: 5479 - __kmem_cache_release(s); 5480 - return -EINVAL; 5481 5295 } 5482 5296 5483 5297 static void list_slab_objects(struct kmem_cache *s, struct slab *slab, ··· 5974 5906 return s; 5975 5907 } 5976 5908 5977 - int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) 5909 + int do_kmem_cache_create(struct kmem_cache *s, const char *name, 5910 + unsigned int size, struct kmem_cache_args *args, 5911 + slab_flags_t flags) 5978 5912 { 5979 - int err; 5913 + int err = -EINVAL; 5980 5914 5981 - err = kmem_cache_open(s, flags); 5982 - if (err) 5983 - return err; 5915 + s->name = name; 5916 + s->size = s->object_size = size; 5917 + 5918 + s->flags = kmem_cache_flags(flags, s->name); 5919 + #ifdef CONFIG_SLAB_FREELIST_HARDENED 5920 + s->random = get_random_long(); 5921 + #endif 5922 + s->align = 
args->align; 5923 + s->ctor = args->ctor; 5924 + #ifdef CONFIG_HARDENED_USERCOPY 5925 + s->useroffset = args->useroffset; 5926 + s->usersize = args->usersize; 5927 + #endif 5928 + 5929 + if (!calculate_sizes(args, s)) 5930 + goto out; 5931 + if (disable_higher_order_debug) { 5932 + /* 5933 + * Disable debugging flags that store metadata if the min slab 5934 + * order increased. 5935 + */ 5936 + if (get_order(s->size) > get_order(s->object_size)) { 5937 + s->flags &= ~DEBUG_METADATA_FLAGS; 5938 + s->offset = 0; 5939 + if (!calculate_sizes(args, s)) 5940 + goto out; 5941 + } 5942 + } 5943 + 5944 + #ifdef system_has_freelist_aba 5945 + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { 5946 + /* Enable fast mode */ 5947 + s->flags |= __CMPXCHG_DOUBLE; 5948 + } 5949 + #endif 5950 + 5951 + /* 5952 + * The larger the object size is, the more slabs we want on the partial 5953 + * list to avoid pounding the page allocator excessively. 5954 + */ 5955 + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); 5956 + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); 5957 + 5958 + set_cpu_partial(s); 5959 + 5960 + #ifdef CONFIG_NUMA 5961 + s->remote_node_defrag_ratio = 1000; 5962 + #endif 5963 + 5964 + /* Initialize the pre-computed randomized freelist if slab is up */ 5965 + if (slab_state >= UP) { 5966 + if (init_cache_random_seq(s)) 5967 + goto out; 5968 + } 5969 + 5970 + if (!init_kmem_cache_nodes(s)) 5971 + goto out; 5972 + 5973 + if (!alloc_kmem_cache_cpus(s)) 5974 + goto out; 5984 5975 5985 5976 /* Mutex is not taken during early boot */ 5986 - if (slab_state <= UP) 5987 - return 0; 5977 + if (slab_state <= UP) { 5978 + err = 0; 5979 + goto out; 5980 + } 5988 5981 5989 5982 err = sysfs_slab_add(s); 5990 - if (err) { 5991 - __kmem_cache_release(s); 5992 - return err; 5993 - } 5983 + if (err) 5984 + goto out; 5994 5985 5995 5986 if (s->flags & SLAB_STORE_USER) 5996 5987 debugfs_slab_add(s); 5997 5988 5998 - return 0; 5989 + 
out: 5990 + if (err) 5991 + __kmem_cache_release(s); 5992 + return err; 5999 5993 } 6000 5994 6001 5995 #ifdef SLAB_SUPPORTS_SYSFS

+3 -2

net/ipv4/inet_connection_sock.c

··· 714 714 out: 715 715 release_sock(sk); 716 716 if (newsk && mem_cgroup_sockets_enabled) { 717 + gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; 717 718 int amt = 0; 718 719 719 720 /* atomically get the memory usage, set and charge the ··· 732 731 } 733 732 734 733 if (amt) 735 - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, 736 - GFP_KERNEL | __GFP_NOFAIL); 734 + mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); 735 + kmem_cache_charge(newsk, gfp); 737 736 738 737 release_sock(newsk); 739 738 }

Configure Feed

Configure Feed