Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

- A new layer for caching objects for allocation and free via percpu
  arrays called sheaves.

  The aim is to combine the good parts of SLAB (lower-overhead and
  simpler percpu caching, compared to SLUB) without the past issues
  with arrays for freeing remote NUMA node objects and their flushing.

  It also allows more efficient kfree_rcu(), and cheaper object
  preallocations for cases where the exact number of objects is
  unknown, but an upper bound is.

  Currently VMAs and maple nodes are using this new caching, with a
  plan to enable it for all caches and remove the complex SLUB fastpath
  based on cpu (partial) slabs and this_cpu_cmpxchg_double().
  (Vlastimil Babka, with Liam Howlett and Pedro Falcato for the maple
  tree changes)

- Re-entrant kmalloc_nolock(), which allows opportunistic allocations
  from NMI and tracing/kprobe contexts.

  Building on prior page allocator and memcg changes, it will result in
  removing BPF-specific caches on top of slab (Alexei Starovoitov)

- Various fixes and cleanups (Kuan-Wei Chiu, Matthew Wilcox, Suren
  Baghdasaryan, Ye Liu)
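
For context on the first item: a cache opts into sheaves by passing a non-zero sheaf_capacity via struct kmem_cache_args at creation time. The sketch below is modeled on the maple_node_cache conversion included in this pull; the cache name, object type and capacity value are illustrative only.

	#include <linux/slab.h>

	struct example_obj {
		unsigned long key;
		void *payload;
	};

	static struct kmem_cache *example_cache;

	static int __init example_cache_init(void)
	{
		struct kmem_cache_args args = {
			.align		= __alignof__(struct example_obj),
			.sheaf_capacity	= 32,	/* per-cpu sheaves hold up to 32 objects */
		};

		example_cache = kmem_cache_create("example_obj",
						  sizeof(struct example_obj),
						  &args, 0);
		return example_cache ? 0 : -ENOMEM;
	}

Callers keep using kmem_cache_alloc()/kmem_cache_free(); with a capacity set, objects transit the per-cpu sheaves and the per-node barn instead of the SLUB cpu-slab fastpath when possible.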

* tag 'slab-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (40 commits)
slab: Introduce kmalloc_nolock() and kfree_nolock().
slab: Reuse first bit for OBJEXTS_ALLOC_FAIL
slab: Make slub local_(try)lock more precise for LOCKDEP
mm: Introduce alloc_frozen_pages_nolock()
mm: Allow GFP_ACCOUNT to be used in alloc_pages_nolock().
locking/local_lock: Introduce local_lock_is_locked().
maple_tree: Convert forking to use the sheaf interface
maple_tree: Add single node allocation support to maple state
maple_tree: Prefilled sheaf conversion and testing
tools/testing: Add support for prefilled slab sheafs
maple_tree: Replace mt_free_one() with kfree()
maple_tree: Use kfree_rcu in ma_free_rcu
testing/radix-tree/maple: Hack around kfree_rcu not existing
tools/testing: include maple-shim.c in maple.c
maple_tree: use percpu sheaves for maple_node_cache
mm, vma: use percpu sheaves for vm_area_struct cache
tools/testing: Add support for changes to slab for sheaves
slab: allow NUMA restricted allocations to use percpu sheaves
tools/testing/vma: Implement vm_refcnt reset
slab: skip percpu sheaves for remote object freeing
...
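
For context on the second item: the new kmalloc_nolock()/kfree_nolock() pair (declared in include/linux/slab.h below) never blocks or spins and may simply fail. A rough sketch of the intended usage, with a hypothetical tracing-style caller:

	#include <linux/types.h>
	#include <linux/slab.h>
	#include <linux/smp.h>

	struct trace_rec {
		u64 ts;
		u32 cpu;
	};

	/* Hypothetical caller that can run in NMI/kprobe context. */
	static struct trace_rec *record_event(u64 ts)
	{
		struct trace_rec *rec;

		/* Opportunistic: returns NULL rather than taking locks or spinning. */
		rec = kmalloc_nolock(sizeof(*rec), __GFP_ZERO, NUMA_NO_NODE);
		if (!rec)
			return NULL;	/* caller must tolerate failure */

		rec->ts = ts;
		rec->cpu = raw_smp_processor_id();
		return rec;
	}

	static void drop_event(struct trace_rec *rec)
	{
		kfree_nolock(rec);	/* also safe from restricted contexts */
	}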

+2909 -1592
+1 -1
include/linux/gfp.h
···
 }
 #define alloc_page_vma(...)	alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))

-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order);
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
 #define alloc_pages_nolock(...)	alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__))

 extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
+8 -5
include/linux/kasan.h
···
 }

 bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
-		       bool still_accessible);
+		       bool still_accessible, bool no_quarantine);
 /**
  * kasan_slab_free - Poison, initialize, and quarantine a slab object.
  * @object: Object to be freed.
···
  * @Return true if KASAN took ownership of the object; false otherwise.
  */
 static __always_inline bool kasan_slab_free(struct kmem_cache *s,
-						void *object, bool init,
-						bool still_accessible)
+					    void *object, bool init,
+					    bool still_accessible,
+					    bool no_quarantine)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, init, still_accessible);
+		return __kasan_slab_free(s, object, init, still_accessible,
+					 no_quarantine);
 	return false;
 }

···
 }

 static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
-				   bool init, bool still_accessible)
+				   bool init, bool still_accessible,
+				   bool no_quarantine)
 {
 	return false;
 }
+2
include/linux/local_lock.h
···
  */
 #define local_trylock(lock)	__local_trylock(this_cpu_ptr(lock))

+#define local_lock_is_locked(lock)	__local_lock_is_locked(lock)
+
 /**
  * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
  *			   interrupts if acquired
+13 -3
include/linux/local_lock_internal.h
···

 /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */
 typedef struct {
-	local_lock_t	llock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+	struct task_struct	*owner;
+#endif
 	u8		acquired;
 } local_trylock_t;

···
 	.owner = NULL,

 # define LOCAL_TRYLOCK_DEBUG_INIT(lockname)		\
-	.llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) },
+	LOCAL_LOCK_DEBUG_INIT(lockname)

 static inline void local_lock_acquire(local_lock_t *l)
 {
···
 	local_lock_debug_init(lock);				\
 } while (0)

-#define __local_trylock_init(lock) __local_lock_init(lock.llock)
+#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock)

 #define __spinlock_nested_bh_init(lock)				\
 do {								\
···
 	}							\
 	!!tl;							\
 })
+
+/* preemption or migration must be disabled before calling __local_lock_is_locked */
+#define __local_lock_is_locked(lock)	READ_ONCE(this_cpu_ptr(lock)->acquired)

 #define __local_lock_release(lock)				\
 	do {							\
···
 		flags = 0;					\
 		__local_trylock(lock);				\
 	})
+
+/* migration must be disabled before calling __local_lock_is_locked */
+#define __local_lock_is_locked(__lock)					\
+	(rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current)

 #endif /* CONFIG_PREEMPT_RT */
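
local_lock_is_locked() reports whether this CPU already holds a local_trylock_t, which is what lets re-entrant code (such as kmalloc_nolock()) defer work instead of self-deadlocking. A sketch of that pattern under the assumption that the caller already runs with preemption/migration disabled, as the added comments require; the per-CPU structure and helper are made up for illustration, only the lock primitives are from this series:

	#include <linux/local_lock.h>
	#include <linux/percpu.h>

	struct pcpu_stash {
		local_trylock_t lock;
		void *slot;
	};

	static DEFINE_PER_CPU(struct pcpu_stash, stash) = {
		.lock = INIT_LOCAL_TRYLOCK(lock),
	};

	/* Called from a context where preemption is already disabled (IRQ/NMI). */
	static bool stash_try_put(void *obj)
	{
		struct pcpu_stash *s;

		/* This CPU is already inside the lock: defer (e.g. via irq_work). */
		if (local_lock_is_locked(&stash.lock))
			return false;

		if (!local_trylock(&stash.lock))
			return false;

		s = this_cpu_ptr(&stash);
		if (!s->slot) {
			s->slot = obj;
			local_unlock(&stash.lock);
			return true;
		}
		local_unlock(&stash.lock);
		return false;
	}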
+5 -1
include/linux/maple_tree.h
···
 	struct maple_enode *node;	/* The node containing this entry */
 	unsigned long min;		/* The minimum index of this node - implied pivot min */
 	unsigned long max;		/* The maximum index of this node - implied pivot max */
-	struct maple_alloc *alloc;	/* Allocated nodes for this operation */
+	struct slab_sheaf *sheaf;	/* Allocated nodes for this operation */
+	struct maple_node *alloc;	/* A single allocated node for fast path writes */
+	unsigned long node_request;	/* The number of nodes to allocate for this operation */
 	enum maple_status status;	/* The status of the state (active, start, none, etc) */
 	unsigned char depth;		/* depth of tree descent during write */
 	unsigned char offset;
···
 		.status = ma_start,					\
 		.min = 0,						\
 		.max = ULONG_MAX,					\
+		.sheaf = NULL,						\
 		.alloc = NULL,						\
+		.node_request = 0,					\
 		.mas_flags = 0,						\
 		.store_type = wr_invalid,				\
 	}
+10 -2
include/linux/memcontrol.h
···
 	__NR_MEMCG_DATA_FLAGS	= (1UL << 2),
 };

+#define __OBJEXTS_ALLOC_FAIL	MEMCG_DATA_OBJEXTS
 #define __FIRST_OBJEXT_FLAG	__NR_MEMCG_DATA_FLAGS

 #else /* CONFIG_MEMCG */

+#define __OBJEXTS_ALLOC_FAIL	(1UL << 0)
 #define __FIRST_OBJEXT_FLAG	(1UL << 0)

 #endif /* CONFIG_MEMCG */

 enum objext_flags {
-	/* slabobj_ext vector failed to allocate */
-	OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
+	/*
+	 * Use bit 0 with zero other bits to signal that slabobj_ext vector
+	 * failed to allocate. The same bit 0 with valid upper bits means
+	 * MEMCG_DATA_OBJEXTS.
+	 */
+	OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL,
+	/* slabobj_ext vector allocated with kmalloc_nolock() */
+	OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG,
 	/* the next bit after the last actual flag */
 	__NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
 };
+10
include/linux/rtmutex.h
···
 	return READ_ONCE(lock->owner) != NULL;
 }

+#ifdef CONFIG_RT_MUTEXES
+#define RT_MUTEX_HAS_WAITERS		1UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
+{
+	unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
+
+	return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
+}
+#endif
 extern void rt_mutex_base_init(struct rt_mutex_base *rtb);

 /**
+51
include/linux/slab.h
···
 	 * %NULL means no constructor.
 	 */
 	void (*ctor)(void *);
+	/**
+	 * @sheaf_capacity: Enable sheaves of given capacity for the cache.
+	 *
+	 * With a non-zero value, allocations from the cache go through caching
+	 * arrays called sheaves. Each cpu has a main sheaf that's always
+	 * present, and a spare sheaf that may be not present. When both become
+	 * empty, there's an attempt to replace an empty sheaf with a full sheaf
+	 * from the per-node barn.
+	 *
+	 * When no full sheaf is available, and gfp flags allow blocking, a
+	 * sheaf is allocated and filled from slab(s) using bulk allocation.
+	 * Otherwise the allocation falls back to the normal operation
+	 * allocating a single object from a slab.
+	 *
+	 * Analogically when freeing and both percpu sheaves are full, the barn
+	 * may replace it with an empty sheaf, unless it's over capacity. In
+	 * that case a sheaf is bulk freed to slab pages.
+	 *
+	 * The sheaves do not enforce NUMA placement of objects, so allocations
+	 * via kmem_cache_alloc_node() with a node specified other than
+	 * NUMA_NO_NODE will bypass them.
+	 *
+	 * Bulk allocation and free operations also try to use the cpu sheaves
+	 * and barn, but fallback to using slab pages directly.
+	 *
+	 * When slub_debug is enabled for the cache, the sheaf_capacity argument
+	 * is ignored.
+	 *
+	 * %0 means no sheaves will be created.
+	 */
+	unsigned int sheaf_capacity;
 };

 struct kmem_cache *__kmem_cache_create_args(const char *name,
···
 #define krealloc(...)	alloc_hooks(krealloc_noprof(__VA_ARGS__))

 void kfree(const void *objp);
+void kfree_nolock(const void *objp);
 void kfree_sensitive(const void *objp);
 size_t __ksize(const void *objp);

···
 					    int node) __assume_slab_alignment __malloc;
 #define kmem_cache_alloc_node(...)	alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))

+struct slab_sheaf *
+kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size);
+
+int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
+		struct slab_sheaf **sheafp, unsigned int size);
+
+void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
+		struct slab_sheaf *sheaf);
+
+void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp,
+		struct slab_sheaf *sheaf) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_from_sheaf(...)	\
+		alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__))
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf);
+
 /*
  * These macros allow declaring a kmem_buckets * parameter alongside size, which
  * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call
···
 	return __kmalloc_noprof(size, flags);
 }
 #define kmalloc(...)	alloc_hooks(kmalloc_noprof(__VA_ARGS__))
+
+void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node);
+#define kmalloc_nolock(...)	alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__))

 #define kmem_buckets_alloc(_b, _size, _flags)	\
 	alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))
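
The prefilled-sheaf functions above back the "cheaper object preallocations where only an upper bound is known" point from the pull message. A sketch of the intended flow; the caller, its error handling and the GFP choices are illustrative:

	#include <linux/slab.h>

	static int process_batch(struct kmem_cache *cache, unsigned int max_items)
	{
		struct slab_sheaf *sheaf;
		unsigned int i;
		void *obj;

		/* May block here, while blocking is still allowed. */
		sheaf = kmem_cache_prefill_sheaf(cache, GFP_KERNEL, max_items);
		if (!sheaf)
			return -ENOMEM;

		for (i = 0; i < max_items; i++) {
			/* Served from the prefilled sheaf, no reclaim needed. */
			obj = kmem_cache_alloc_from_sheaf(cache, GFP_NOWAIT, sheaf);
			if (!obj)
				break;
			/* ... initialize and publish obj ... */
		}

		/* Hand any unused objects back to the cache's sheaves/barn. */
		kmem_cache_return_sheaf(cache, GFP_KERNEL, sheaf);
		return 0;
	}

kmem_cache_refill_sheaf() can top the same sheaf up again if the estimate grows, and kmem_cache_sheaf_size() reports how many objects it currently holds.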
+1 -1
kernel/bpf/stream.c
···
 	struct bpf_stream_page *stream_page, *old_stream_page;
 	struct page *page;

-	page = alloc_pages_nolock(NUMA_NO_NODE, 0);
+	page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
 	if (!page)
 		return NULL;
 	stream_page = page_address(page);
+1 -1
kernel/bpf/syscall.c
···
 static struct page *__bpf_alloc_page(int nid)
 {
 	if (!can_alloc_pages())
-		return alloc_pages_nolock(nid, 0);
+		return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);

 	return alloc_pages_node(nid,
 				GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
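
Both call-site changes above follow from alloc_pages_nolock() taking a leading gfp_flags argument, with __GFP_ACCOUNT as the only extra flag accepted (see the mm/page_alloc.c kernel-doc further down). A condensed illustration using a hypothetical wrapper:

	#include <linux/gfp.h>

	static struct page *grab_page_nolock(int nid, bool account)
	{
		/* Only __GFP_ACCOUNT may be passed; 0 means no memcg accounting. */
		return alloc_pages_nolock(account ? __GFP_ACCOUNT : 0, nid, 0);
	}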
-9
kernel/locking/rtmutex_common.h
···
 				   pi_tree.entry);
 }

-#define RT_MUTEX_HAS_WAITERS	1UL
-
-static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
-{
-	unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
-
-	return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
-}
-
 /*
  * Constants for rt mutex functions which have a selectable deadlock
  * detection.
+121 -558
lib/maple_tree.c
··· 83 83 84 84 /* 85 85 * Maple state flags 86 - * * MA_STATE_BULK - Bulk insert mode 87 - * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert 88 86 * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation 89 87 */ 90 - #define MA_STATE_BULK 1 91 - #define MA_STATE_REBALANCE 2 92 - #define MA_STATE_PREALLOC 4 88 + #define MA_STATE_PREALLOC 1 93 89 94 90 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) 95 91 #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) ··· 172 176 return kmem_cache_alloc(maple_node_cache, gfp); 173 177 } 174 178 175 - static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) 176 - { 177 - return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); 178 - } 179 - 180 - static inline void mt_free_one(struct maple_node *node) 181 - { 182 - kmem_cache_free(maple_node_cache, node); 183 - } 184 - 185 179 static inline void mt_free_bulk(size_t size, void __rcu **nodes) 186 180 { 187 181 kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); 188 182 } 189 183 190 - static void mt_free_rcu(struct rcu_head *head) 184 + static void mt_return_sheaf(struct slab_sheaf *sheaf) 191 185 { 192 - struct maple_node *node = container_of(head, struct maple_node, rcu); 186 + kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf); 187 + } 193 188 194 - kmem_cache_free(maple_node_cache, node); 189 + static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count) 190 + { 191 + return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count); 192 + } 193 + 194 + static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf, 195 + unsigned int size) 196 + { 197 + return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size); 195 198 } 196 199 197 200 /* ··· 203 208 static void ma_free_rcu(struct maple_node *node) 204 209 { 205 210 WARN_ON(node->parent != ma_parent_ptr(node)); 206 - call_rcu(&node->rcu, mt_free_rcu); 211 + kfree_rcu(node, rcu); 207 212 } 208 213 209 214 static void mt_set_height(struct maple_tree *mt, unsigned char height) ··· 583 588 584 589 node = mte_to_node(enode); 585 590 return ma_dead_node(node); 586 - } 587 - 588 - /* 589 - * mas_allocated() - Get the number of nodes allocated in a maple state. 590 - * @mas: The maple state 591 - * 592 - * The ma_state alloc member is overloaded to hold a pointer to the first 593 - * allocated node or to the number of requested nodes to allocate. If bit 0 is 594 - * set, then the alloc contains the number of requested nodes. If there is an 595 - * allocated node, then the total allocated nodes is in that node. 596 - * 597 - * Return: The total number of nodes allocated 598 - */ 599 - static inline unsigned long mas_allocated(const struct ma_state *mas) 600 - { 601 - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) 602 - return 0; 603 - 604 - return mas->alloc->total; 605 - } 606 - 607 - /* 608 - * mas_set_alloc_req() - Set the requested number of allocations. 609 - * @mas: the maple state 610 - * @count: the number of allocations. 611 - * 612 - * The requested number of allocations is either in the first allocated node, 613 - * located in @mas->alloc->request_count, or directly in @mas->alloc if there is 614 - * no allocated node. Set the request either in the node or do the necessary 615 - * encoding to store in @mas->alloc directly. 
616 - */ 617 - static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) 618 - { 619 - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { 620 - if (!count) 621 - mas->alloc = NULL; 622 - else 623 - mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); 624 - return; 625 - } 626 - 627 - mas->alloc->request_count = count; 628 - } 629 - 630 - /* 631 - * mas_alloc_req() - get the requested number of allocations. 632 - * @mas: The maple state 633 - * 634 - * The alloc count is either stored directly in @mas, or in 635 - * @mas->alloc->request_count if there is at least one node allocated. Decode 636 - * the request count if it's stored directly in @mas->alloc. 637 - * 638 - * Return: The allocation request count. 639 - */ 640 - static inline unsigned int mas_alloc_req(const struct ma_state *mas) 641 - { 642 - if ((unsigned long)mas->alloc & 0x1) 643 - return (unsigned long)(mas->alloc) >> 1; 644 - else if (mas->alloc) 645 - return mas->alloc->request_count; 646 - return 0; 647 591 } 648 592 649 593 /* ··· 966 1032 } 967 1033 968 1034 /* 969 - * mte_set_gap() - Set a maple node gap. 970 - * @mn: The encoded maple node 971 - * @gap: The offset of the gap to set 972 - * @val: The gap value 973 - */ 974 - static inline void mte_set_gap(const struct maple_enode *mn, 975 - unsigned char gap, unsigned long val) 976 - { 977 - switch (mte_node_type(mn)) { 978 - default: 979 - break; 980 - case maple_arange_64: 981 - mte_to_node(mn)->ma64.gap[gap] = val; 982 - break; 983 - } 984 - } 985 - 986 - /* 987 1035 * mas_ascend() - Walk up a level of the tree. 988 1036 * @mas: The maple state 989 1037 * ··· 1068 1152 * 1069 1153 * Return: A pointer to a maple node. 1070 1154 */ 1071 - static inline struct maple_node *mas_pop_node(struct ma_state *mas) 1155 + static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas) 1072 1156 { 1073 - struct maple_alloc *ret, *node = mas->alloc; 1074 - unsigned long total = mas_allocated(mas); 1075 - unsigned int req = mas_alloc_req(mas); 1157 + struct maple_node *ret; 1076 1158 1077 - /* nothing or a request pending. */ 1078 - if (WARN_ON(!total)) 1159 + if (mas->alloc) { 1160 + ret = mas->alloc; 1161 + mas->alloc = NULL; 1162 + goto out; 1163 + } 1164 + 1165 + if (WARN_ON_ONCE(!mas->sheaf)) 1079 1166 return NULL; 1080 1167 1081 - if (total == 1) { 1082 - /* single allocation in this ma_state */ 1083 - mas->alloc = NULL; 1084 - ret = node; 1085 - goto single_node; 1086 - } 1168 + ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); 1087 1169 1088 - if (node->node_count == 1) { 1089 - /* Single allocation in this node. */ 1090 - mas->alloc = node->slot[0]; 1091 - mas->alloc->total = node->total - 1; 1092 - ret = node; 1093 - goto new_head; 1094 - } 1095 - node->total--; 1096 - ret = node->slot[--node->node_count]; 1097 - node->slot[node->node_count] = NULL; 1098 - 1099 - single_node: 1100 - new_head: 1101 - if (req) { 1102 - req++; 1103 - mas_set_alloc_req(mas, req); 1104 - } 1105 - 1170 + out: 1106 1171 memset(ret, 0, sizeof(*ret)); 1107 - return (struct maple_node *)ret; 1108 - } 1109 - 1110 - /* 1111 - * mas_push_node() - Push a node back on the maple state allocation. 1112 - * @mas: The maple state 1113 - * @used: The used maple node 1114 - * 1115 - * Stores the maple node back into @mas->alloc for reuse. Updates allocated and 1116 - * requested node count as necessary. 
1117 - */ 1118 - static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) 1119 - { 1120 - struct maple_alloc *reuse = (struct maple_alloc *)used; 1121 - struct maple_alloc *head = mas->alloc; 1122 - unsigned long count; 1123 - unsigned int requested = mas_alloc_req(mas); 1124 - 1125 - count = mas_allocated(mas); 1126 - 1127 - reuse->request_count = 0; 1128 - reuse->node_count = 0; 1129 - if (count) { 1130 - if (head->node_count < MAPLE_ALLOC_SLOTS) { 1131 - head->slot[head->node_count++] = reuse; 1132 - head->total++; 1133 - goto done; 1134 - } 1135 - reuse->slot[0] = head; 1136 - reuse->node_count = 1; 1137 - } 1138 - 1139 - reuse->total = count + 1; 1140 - mas->alloc = reuse; 1141 - done: 1142 - if (requested > 1) 1143 - mas_set_alloc_req(mas, requested - 1); 1172 + return ret; 1144 1173 } 1145 1174 1146 1175 /* ··· 1095 1234 */ 1096 1235 static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) 1097 1236 { 1098 - struct maple_alloc *node; 1099 - unsigned long allocated = mas_allocated(mas); 1100 - unsigned int requested = mas_alloc_req(mas); 1101 - unsigned int count; 1102 - void **slots = NULL; 1103 - unsigned int max_req = 0; 1104 - 1105 - if (!requested) 1237 + if (!mas->node_request) 1106 1238 return; 1107 1239 1108 - mas_set_alloc_req(mas, 0); 1109 - if (mas->mas_flags & MA_STATE_PREALLOC) { 1110 - if (allocated) 1240 + if (mas->node_request == 1) { 1241 + if (mas->sheaf) 1242 + goto use_sheaf; 1243 + 1244 + if (mas->alloc) 1111 1245 return; 1112 - WARN_ON(!allocated); 1246 + 1247 + mas->alloc = mt_alloc_one(gfp); 1248 + if (!mas->alloc) 1249 + goto error; 1250 + 1251 + mas->node_request = 0; 1252 + return; 1113 1253 } 1114 1254 1115 - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { 1116 - node = (struct maple_alloc *)mt_alloc_one(gfp); 1117 - if (!node) 1118 - goto nomem_one; 1255 + use_sheaf: 1256 + if (unlikely(mas->alloc)) { 1257 + kfree(mas->alloc); 1258 + mas->alloc = NULL; 1259 + } 1119 1260 1120 - if (allocated) { 1121 - node->slot[0] = mas->alloc; 1122 - node->node_count = 1; 1123 - } else { 1124 - node->node_count = 0; 1261 + if (mas->sheaf) { 1262 + unsigned long refill; 1263 + 1264 + refill = mas->node_request; 1265 + if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { 1266 + mas->node_request = 0; 1267 + return; 1125 1268 } 1126 1269 1127 - mas->alloc = node; 1128 - node->total = ++allocated; 1129 - node->request_count = 0; 1130 - requested--; 1270 + if (mt_refill_sheaf(gfp, &mas->sheaf, refill)) 1271 + goto error; 1272 + 1273 + mas->node_request = 0; 1274 + return; 1131 1275 } 1132 1276 1133 - node = mas->alloc; 1134 - while (requested) { 1135 - max_req = MAPLE_ALLOC_SLOTS - node->node_count; 1136 - slots = (void **)&node->slot[node->node_count]; 1137 - max_req = min(requested, max_req); 1138 - count = mt_alloc_bulk(gfp, max_req, slots); 1139 - if (!count) 1140 - goto nomem_bulk; 1141 - 1142 - if (node->node_count == 0) { 1143 - node->slot[0]->node_count = 0; 1144 - node->slot[0]->request_count = 0; 1145 - } 1146 - 1147 - node->node_count += count; 1148 - allocated += count; 1149 - /* find a non-full node*/ 1150 - do { 1151 - node = node->slot[0]; 1152 - } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); 1153 - requested -= count; 1277 + mas->sheaf = mt_get_sheaf(gfp, mas->node_request); 1278 + if (likely(mas->sheaf)) { 1279 + mas->node_request = 0; 1280 + return; 1154 1281 } 1155 - mas->alloc->total = allocated; 1156 - return; 1157 1282 1158 - nomem_bulk: 1159 - /* Clean up potential freed allocations on bulk 
failure */ 1160 - memset(slots, 0, max_req * sizeof(unsigned long)); 1161 - mas->alloc->total = allocated; 1162 - nomem_one: 1163 - mas_set_alloc_req(mas, requested); 1283 + error: 1164 1284 mas_set_err(mas, -ENOMEM); 1285 + } 1286 + 1287 + static inline void mas_empty_nodes(struct ma_state *mas) 1288 + { 1289 + mas->node_request = 0; 1290 + if (mas->sheaf) { 1291 + mt_return_sheaf(mas->sheaf); 1292 + mas->sheaf = NULL; 1293 + } 1294 + 1295 + if (mas->alloc) { 1296 + kfree(mas->alloc); 1297 + mas->alloc = NULL; 1298 + } 1165 1299 } 1166 1300 1167 1301 /* ··· 1169 1313 */ 1170 1314 static inline void mas_free(struct ma_state *mas, struct maple_enode *used) 1171 1315 { 1172 - struct maple_node *tmp = mte_to_node(used); 1173 - 1174 - if (mt_in_rcu(mas->tree)) 1175 - ma_free_rcu(tmp); 1176 - else 1177 - mas_push_node(mas, tmp); 1178 - } 1179 - 1180 - /* 1181 - * mas_node_count_gfp() - Check if enough nodes are allocated and request more 1182 - * if there is not enough nodes. 1183 - * @mas: The maple state 1184 - * @count: The number of nodes needed 1185 - * @gfp: the gfp flags 1186 - */ 1187 - static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) 1188 - { 1189 - unsigned long allocated = mas_allocated(mas); 1190 - 1191 - if (allocated < count) { 1192 - mas_set_alloc_req(mas, count - allocated); 1193 - mas_alloc_nodes(mas, gfp); 1194 - } 1195 - } 1196 - 1197 - /* 1198 - * mas_node_count() - Check if enough nodes are allocated and request more if 1199 - * there is not enough nodes. 1200 - * @mas: The maple state 1201 - * @count: The number of nodes needed 1202 - * 1203 - * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. 1204 - */ 1205 - static void mas_node_count(struct ma_state *mas, int count) 1206 - { 1207 - return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); 1316 + ma_free_rcu(mte_to_node(used)); 1208 1317 } 1209 1318 1210 1319 /* ··· 1699 1878 * end on a NULL entry, with the exception of the left-most leaf. The 1700 1879 * limitation means that the split of a node must be checked for this condition 1701 1880 * and be able to put more data in one direction or the other. 1702 - */ 1703 - if (unlikely((mas->mas_flags & MA_STATE_BULK))) { 1704 - *mid_split = 0; 1705 - split = b_end - mt_min_slots[bn->type]; 1706 - 1707 - if (!ma_is_leaf(bn->type)) 1708 - return split; 1709 - 1710 - mas->mas_flags |= MA_STATE_REBALANCE; 1711 - if (!bn->slot[split]) 1712 - split--; 1713 - return split; 1714 - } 1715 - 1716 - /* 1881 + * 1717 1882 * Although extremely rare, it is possible to enter what is known as the 3-way 1718 1883 * split scenario. The 3-way split comes about by means of a store of a range 1719 1884 * that overwrites the end and beginning of two full nodes. The result is a set ··· 1847 2040 } 1848 2041 1849 2042 /* 1850 - * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. 1851 - * @mas: The maple state 1852 - * @end: The maple node end 1853 - * @mt: The maple node type 1854 - */ 1855 - static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, 1856 - enum maple_type mt) 1857 - { 1858 - if (!(mas->mas_flags & MA_STATE_BULK)) 1859 - return; 1860 - 1861 - if (mte_is_root(mas->node)) 1862 - return; 1863 - 1864 - if (end > mt_min_slots[mt]) { 1865 - mas->mas_flags &= ~MA_STATE_REBALANCE; 1866 - return; 1867 - } 1868 - } 1869 - 1870 - /* 1871 2043 * mas_store_b_node() - Store an @entry into the b_node while also copying the 1872 2044 * data from a maple encoded node. 
1873 2045 * @wr_mas: the maple write state ··· 1895 2109 /* Handle new range ending before old range ends */ 1896 2110 piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); 1897 2111 if (piv > mas->last) { 1898 - if (piv == ULONG_MAX) 1899 - mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); 1900 - 1901 2112 if (offset_end != slot) 1902 2113 wr_mas->content = mas_slot_locked(mas, wr_mas->slots, 1903 2114 offset_end); ··· 2306 2523 enode = tmp_mas->node; 2307 2524 tmp = mte_to_node(enode); 2308 2525 mte_set_node_dead(enode); 2309 - if (in_rcu) 2310 - ma_free_rcu(tmp); 2311 - else 2312 - mas_push_node(mas, tmp); 2526 + ma_free_rcu(tmp); 2313 2527 } 2314 2528 2315 2529 /* ··· 2789 3009 } 2790 3010 2791 3011 return mas_spanning_rebalance(mas, &mast, empty_count); 2792 - } 2793 - 2794 - /* 2795 - * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple 2796 - * state. 2797 - * @mas: The maple state 2798 - * @end: The end of the left-most node. 2799 - * 2800 - * During a mass-insert event (such as forking), it may be necessary to 2801 - * rebalance the left-most node when it is not sufficient. 2802 - */ 2803 - static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) 2804 - { 2805 - enum maple_type mt = mte_node_type(mas->node); 2806 - struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; 2807 - struct maple_enode *eparent, *old_eparent; 2808 - unsigned char offset, tmp, split = mt_slots[mt] / 2; 2809 - void __rcu **l_slots, **slots; 2810 - unsigned long *l_pivs, *pivs, gap; 2811 - bool in_rcu = mt_in_rcu(mas->tree); 2812 - unsigned char new_height = mas_mt_height(mas); 2813 - 2814 - MA_STATE(l_mas, mas->tree, mas->index, mas->last); 2815 - 2816 - l_mas = *mas; 2817 - mas_prev_sibling(&l_mas); 2818 - 2819 - /* set up node. */ 2820 - if (in_rcu) { 2821 - newnode = mas_pop_node(mas); 2822 - } else { 2823 - newnode = &reuse; 2824 - } 2825 - 2826 - node = mas_mn(mas); 2827 - newnode->parent = node->parent; 2828 - slots = ma_slots(newnode, mt); 2829 - pivs = ma_pivots(newnode, mt); 2830 - left = mas_mn(&l_mas); 2831 - l_slots = ma_slots(left, mt); 2832 - l_pivs = ma_pivots(left, mt); 2833 - if (!l_slots[split]) 2834 - split++; 2835 - tmp = mas_data_end(&l_mas) - split; 2836 - 2837 - memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); 2838 - memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); 2839 - pivs[tmp] = l_mas.max; 2840 - memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); 2841 - memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); 2842 - 2843 - l_mas.max = l_pivs[split]; 2844 - mas->min = l_mas.max + 1; 2845 - old_eparent = mt_mk_node(mte_parent(l_mas.node), 2846 - mas_parent_type(&l_mas, l_mas.node)); 2847 - tmp += end; 2848 - if (!in_rcu) { 2849 - unsigned char max_p = mt_pivots[mt]; 2850 - unsigned char max_s = mt_slots[mt]; 2851 - 2852 - if (tmp < max_p) 2853 - memset(pivs + tmp, 0, 2854 - sizeof(unsigned long) * (max_p - tmp)); 2855 - 2856 - if (tmp < mt_slots[mt]) 2857 - memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); 2858 - 2859 - memcpy(node, newnode, sizeof(struct maple_node)); 2860 - ma_set_meta(node, mt, 0, tmp - 1); 2861 - mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), 2862 - l_pivs[split]); 2863 - 2864 - /* Remove data from l_pivs. 
*/ 2865 - tmp = split + 1; 2866 - memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); 2867 - memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); 2868 - ma_set_meta(left, mt, 0, split); 2869 - eparent = old_eparent; 2870 - 2871 - goto done; 2872 - } 2873 - 2874 - /* RCU requires replacing both l_mas, mas, and parent. */ 2875 - mas->node = mt_mk_node(newnode, mt); 2876 - ma_set_meta(newnode, mt, 0, tmp); 2877 - 2878 - new_left = mas_pop_node(mas); 2879 - new_left->parent = left->parent; 2880 - mt = mte_node_type(l_mas.node); 2881 - slots = ma_slots(new_left, mt); 2882 - pivs = ma_pivots(new_left, mt); 2883 - memcpy(slots, l_slots, sizeof(void *) * split); 2884 - memcpy(pivs, l_pivs, sizeof(unsigned long) * split); 2885 - ma_set_meta(new_left, mt, 0, split); 2886 - l_mas.node = mt_mk_node(new_left, mt); 2887 - 2888 - /* replace parent. */ 2889 - offset = mte_parent_slot(mas->node); 2890 - mt = mas_parent_type(&l_mas, l_mas.node); 2891 - parent = mas_pop_node(mas); 2892 - slots = ma_slots(parent, mt); 2893 - pivs = ma_pivots(parent, mt); 2894 - memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); 2895 - rcu_assign_pointer(slots[offset], mas->node); 2896 - rcu_assign_pointer(slots[offset - 1], l_mas.node); 2897 - pivs[offset - 1] = l_mas.max; 2898 - eparent = mt_mk_node(parent, mt); 2899 - done: 2900 - gap = mas_leaf_max_gap(mas); 2901 - mte_set_gap(eparent, mte_parent_slot(mas->node), gap); 2902 - gap = mas_leaf_max_gap(&l_mas); 2903 - mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); 2904 - mas_ascend(mas); 2905 - 2906 - if (in_rcu) { 2907 - mas_replace_node(mas, old_eparent, new_height); 2908 - mas_adopt_children(mas, mas->node); 2909 - } 2910 - 2911 - mas_update_gap(mas); 2912 3012 } 2913 3013 2914 3014 /* ··· 3497 3837 3498 3838 if (mas->last == wr_mas->end_piv) 3499 3839 offset_end++; /* don't copy this offset */ 3500 - else if (unlikely(wr_mas->r_max == ULONG_MAX)) 3501 - mas_bulk_rebalance(mas, mas->end, wr_mas->type); 3502 3840 3503 3841 /* set up node. */ 3504 3842 if (in_rcu) { ··· 3832 4174 * 3833 4175 * Return: Number of nodes required for preallocation. 
3834 4176 */ 3835 - static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) 4177 + static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) 3836 4178 { 3837 4179 struct ma_state *mas = wr_mas->mas; 3838 4180 unsigned char height = mas_mt_height(mas); ··· 3878 4220 WARN_ON_ONCE(1); 3879 4221 } 3880 4222 3881 - return ret; 4223 + mas->node_request = ret; 3882 4224 } 3883 4225 3884 4226 /* ··· 3913 4255 new_end = mas_wr_new_end(wr_mas); 3914 4256 /* Potential spanning rebalance collapsing a node */ 3915 4257 if (new_end < mt_min_slots[wr_mas->type]) { 3916 - if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) 4258 + if (!mte_is_root(mas->node)) 3917 4259 return wr_rebalance; 3918 4260 return wr_node_store; 3919 4261 } ··· 3939 4281 */ 3940 4282 static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) 3941 4283 { 3942 - int request; 4284 + struct ma_state *mas = wr_mas->mas; 3943 4285 3944 4286 mas_wr_prealloc_setup(wr_mas); 3945 - wr_mas->mas->store_type = mas_wr_store_type(wr_mas); 3946 - request = mas_prealloc_calc(wr_mas, entry); 3947 - if (!request) 4287 + mas->store_type = mas_wr_store_type(wr_mas); 4288 + mas_prealloc_calc(wr_mas, entry); 4289 + if (!mas->node_request) 3948 4290 return; 3949 4291 3950 - mas_node_count(wr_mas->mas, request); 4292 + mas_alloc_nodes(mas, GFP_NOWAIT); 3951 4293 } 3952 4294 3953 4295 /** ··· 4939 5281 mt_free_bulk(node->slot_len, slots); 4940 5282 4941 5283 free_leaf: 4942 - mt_free_rcu(&node->rcu); 5284 + kfree(node); 4943 5285 } 4944 5286 4945 5287 static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, ··· 5023 5365 5024 5366 free_leaf: 5025 5367 if (free) 5026 - mt_free_rcu(&node->rcu); 5368 + kfree(node); 5027 5369 else 5028 5370 mt_clear_meta(mt, node, node->type); 5029 5371 } ··· 5060 5402 */ 5061 5403 void *mas_store(struct ma_state *mas, void *entry) 5062 5404 { 5063 - int request; 5064 5405 MA_WR_STATE(wr_mas, mas, entry); 5065 5406 5066 5407 trace_ma_write(__func__, mas, 0, entry); ··· 5089 5432 return wr_mas.content; 5090 5433 } 5091 5434 5092 - request = mas_prealloc_calc(&wr_mas, entry); 5093 - if (!request) 5435 + mas_prealloc_calc(&wr_mas, entry); 5436 + if (!mas->node_request) 5094 5437 goto store; 5095 5438 5096 - mas_node_count(mas, request); 5439 + mas_alloc_nodes(mas, GFP_NOWAIT); 5097 5440 if (mas_is_err(mas)) 5098 5441 return NULL; 5099 5442 ··· 5181 5524 int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) 5182 5525 { 5183 5526 MA_WR_STATE(wr_mas, mas, entry); 5184 - int ret = 0; 5185 - int request; 5186 5527 5187 5528 mas_wr_prealloc_setup(&wr_mas); 5188 5529 mas->store_type = mas_wr_store_type(&wr_mas); 5189 - request = mas_prealloc_calc(&wr_mas, entry); 5190 - if (!request) 5530 + mas_prealloc_calc(&wr_mas, entry); 5531 + if (!mas->node_request) 5191 5532 goto set_flag; 5192 5533 5193 5534 mas->mas_flags &= ~MA_STATE_PREALLOC; 5194 - mas_node_count_gfp(mas, request, gfp); 5535 + mas_alloc_nodes(mas, gfp); 5195 5536 if (mas_is_err(mas)) { 5196 - mas_set_alloc_req(mas, 0); 5197 - ret = xa_err(mas->node); 5537 + int ret = xa_err(mas->node); 5538 + 5539 + mas->node_request = 0; 5198 5540 mas_destroy(mas); 5199 5541 mas_reset(mas); 5200 5542 return ret; ··· 5201 5545 5202 5546 set_flag: 5203 5547 mas->mas_flags |= MA_STATE_PREALLOC; 5204 - return ret; 5548 + return 0; 5205 5549 } 5206 5550 EXPORT_SYMBOL_GPL(mas_preallocate); 5207 5551 ··· 5215 5559 */ 5216 5560 void mas_destroy(struct ma_state *mas) 5217 5561 { 5218 - struct 
maple_alloc *node; 5219 - unsigned long total; 5220 - 5221 - /* 5222 - * When using mas_for_each() to insert an expected number of elements, 5223 - * it is possible that the number inserted is less than the expected 5224 - * number. To fix an invalid final node, a check is performed here to 5225 - * rebalance the previous node with the final node. 5226 - */ 5227 - if (mas->mas_flags & MA_STATE_REBALANCE) { 5228 - unsigned char end; 5229 - if (mas_is_err(mas)) 5230 - mas_reset(mas); 5231 - mas_start(mas); 5232 - mtree_range_walk(mas); 5233 - end = mas->end + 1; 5234 - if (end < mt_min_slot_count(mas->node) - 1) 5235 - mas_destroy_rebalance(mas, end); 5236 - 5237 - mas->mas_flags &= ~MA_STATE_REBALANCE; 5238 - } 5239 - mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); 5240 - 5241 - total = mas_allocated(mas); 5242 - while (total) { 5243 - node = mas->alloc; 5244 - mas->alloc = node->slot[0]; 5245 - if (node->node_count > 1) { 5246 - size_t count = node->node_count - 1; 5247 - 5248 - mt_free_bulk(count, (void __rcu **)&node->slot[1]); 5249 - total -= count; 5250 - } 5251 - mt_free_one(ma_mnode_ptr(node)); 5252 - total--; 5253 - } 5254 - 5255 - mas->alloc = NULL; 5562 + mas->mas_flags &= ~MA_STATE_PREALLOC; 5563 + mas_empty_nodes(mas); 5256 5564 } 5257 5565 EXPORT_SYMBOL_GPL(mas_destroy); 5258 - 5259 - /* 5260 - * mas_expected_entries() - Set the expected number of entries that will be inserted. 5261 - * @mas: The maple state 5262 - * @nr_entries: The number of expected entries. 5263 - * 5264 - * This will attempt to pre-allocate enough nodes to store the expected number 5265 - * of entries. The allocations will occur using the bulk allocator interface 5266 - * for speed. Please call mas_destroy() on the @mas after inserting the entries 5267 - * to ensure any unused nodes are freed. 5268 - * 5269 - * Return: 0 on success, -ENOMEM if memory could not be allocated. 5270 - */ 5271 - int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) 5272 - { 5273 - int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; 5274 - struct maple_enode *enode = mas->node; 5275 - int nr_nodes; 5276 - int ret; 5277 - 5278 - /* 5279 - * Sometimes it is necessary to duplicate a tree to a new tree, such as 5280 - * forking a process and duplicating the VMAs from one tree to a new 5281 - * tree. When such a situation arises, it is known that the new tree is 5282 - * not going to be used until the entire tree is populated. For 5283 - * performance reasons, it is best to use a bulk load with RCU disabled. 5284 - * This allows for optimistic splitting that favours the left and reuse 5285 - * of nodes during the operation. 5286 - */ 5287 - 5288 - /* Optimize splitting for bulk insert in-order */ 5289 - mas->mas_flags |= MA_STATE_BULK; 5290 - 5291 - /* 5292 - * Avoid overflow, assume a gap between each entry and a trailing null. 5293 - * If this is wrong, it just means allocation can happen during 5294 - * insertion of entries. 
5295 - */ 5296 - nr_nodes = max(nr_entries, nr_entries * 2 + 1); 5297 - if (!mt_is_alloc(mas->tree)) 5298 - nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; 5299 - 5300 - /* Leaves; reduce slots to keep space for expansion */ 5301 - nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); 5302 - /* Internal nodes */ 5303 - nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); 5304 - /* Add working room for split (2 nodes) + new parents */ 5305 - mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); 5306 - 5307 - /* Detect if allocations run out */ 5308 - mas->mas_flags |= MA_STATE_PREALLOC; 5309 - 5310 - if (!mas_is_err(mas)) 5311 - return 0; 5312 - 5313 - ret = xa_err(mas->node); 5314 - mas->node = enode; 5315 - mas_destroy(mas); 5316 - return ret; 5317 - 5318 - } 5319 - EXPORT_SYMBOL_GPL(mas_expected_entries); 5320 5566 5321 5567 static void mas_may_activate(struct ma_state *mas) 5322 5568 { ··· 5851 6293 mas_alloc_nodes(mas, gfp); 5852 6294 } 5853 6295 5854 - if (!mas_allocated(mas)) 6296 + if (!mas->sheaf && !mas->alloc) 5855 6297 return false; 5856 6298 5857 6299 mas->status = ma_start; ··· 5860 6302 5861 6303 void __init maple_tree_init(void) 5862 6304 { 6305 + struct kmem_cache_args args = { 6306 + .align = sizeof(struct maple_node), 6307 + .sheaf_capacity = 32, 6308 + }; 6309 + 5863 6310 maple_node_cache = kmem_cache_create("maple_node", 5864 - sizeof(struct maple_node), sizeof(struct maple_node), 5865 - SLAB_PANIC, NULL); 6311 + sizeof(struct maple_node), &args, 6312 + SLAB_PANIC); 5866 6313 } 5867 6314 5868 6315 /** ··· 6200 6637 } 6201 6638 6202 6639 node = mte_to_node(mas->node); 6203 - mt_free_one(node); 6640 + kfree(node); 6204 6641 } 6205 6642 6206 6643 /* ··· 6241 6678 struct maple_node *node = mte_to_node(mas->node); 6242 6679 struct maple_node *new_node = mte_to_node(new_mas->node); 6243 6680 enum maple_type type; 6244 - unsigned char request, count, i; 6681 + unsigned char count, i; 6245 6682 void __rcu **slots; 6246 6683 void __rcu **new_slots; 6247 6684 unsigned long val; ··· 6249 6686 /* Allocate memory for child nodes. */ 6250 6687 type = mte_node_type(mas->node); 6251 6688 new_slots = ma_slots(new_node, type); 6252 - request = mas_data_end(mas) + 1; 6253 - count = mt_alloc_bulk(gfp, request, (void **)new_slots); 6254 - if (unlikely(count < request)) { 6255 - memset(new_slots, 0, request * sizeof(void *)); 6256 - mas_set_err(mas, -ENOMEM); 6689 + count = mas->node_request = mas_data_end(mas) + 1; 6690 + mas_alloc_nodes(mas, gfp); 6691 + if (unlikely(mas_is_err(mas))) 6257 6692 return; 6258 - } 6259 6693 6260 - /* Restore node type information in slots. */ 6261 6694 slots = ma_slots(node, type); 6262 6695 for (i = 0; i < count; i++) { 6263 6696 val = (unsigned long)mt_slot_locked(mas->tree, slots, i); 6264 6697 val &= MAPLE_NODE_MASK; 6265 - ((unsigned long *)new_slots)[i] |= val; 6698 + new_slots[i] = ma_mnode_ptr((unsigned long)mas_pop_node(mas) | 6699 + val); 6266 6700 } 6267 6701 } 6268 6702 ··· 6313 6753 /* Only allocate child nodes for non-leaf nodes. 
*/ 6314 6754 mas_dup_alloc(mas, new_mas, gfp); 6315 6755 if (unlikely(mas_is_err(mas))) 6316 - return; 6756 + goto empty_mas; 6317 6757 } else { 6318 6758 /* 6319 6759 * This is the last leaf node and duplication is ··· 6346 6786 /* Make them the same height */ 6347 6787 new_mas->tree->ma_flags = mas->tree->ma_flags; 6348 6788 rcu_assign_pointer(new_mas->tree->ma_root, root); 6789 + empty_mas: 6790 + mas_empty_nodes(mas); 6349 6791 } 6350 6792 6351 6793 /** ··· 7245 7683 7246 7684 pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, 7247 7685 mas->index, mas->last); 7248 - pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", 7249 - mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); 7686 + pr_err(" min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n", 7687 + mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth, 7688 + mas->mas_flags); 7250 7689 if (mas->index > mas->last) 7251 7690 pr_err("Check index & last\n"); 7252 7691 }
-137
lib/test_maple_tree.c
··· 2746 2746 mtree_test_erase(mt, ULONG_MAX - 10); 2747 2747 } 2748 2748 2749 - /* duplicate the tree with a specific gap */ 2750 - static noinline void __init check_dup_gaps(struct maple_tree *mt, 2751 - unsigned long nr_entries, bool zero_start, 2752 - unsigned long gap) 2753 - { 2754 - unsigned long i = 0; 2755 - struct maple_tree newmt; 2756 - int ret; 2757 - void *tmp; 2758 - MA_STATE(mas, mt, 0, 0); 2759 - MA_STATE(newmas, &newmt, 0, 0); 2760 - struct rw_semaphore newmt_lock; 2761 - 2762 - init_rwsem(&newmt_lock); 2763 - mt_set_external_lock(&newmt, &newmt_lock); 2764 - 2765 - if (!zero_start) 2766 - i = 1; 2767 - 2768 - mt_zero_nr_tallocated(); 2769 - for (; i <= nr_entries; i++) 2770 - mtree_store_range(mt, i*10, (i+1)*10 - gap, 2771 - xa_mk_value(i), GFP_KERNEL); 2772 - 2773 - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); 2774 - mt_set_non_kernel(99999); 2775 - down_write(&newmt_lock); 2776 - ret = mas_expected_entries(&newmas, nr_entries); 2777 - mt_set_non_kernel(0); 2778 - MT_BUG_ON(mt, ret != 0); 2779 - 2780 - rcu_read_lock(); 2781 - mas_for_each(&mas, tmp, ULONG_MAX) { 2782 - newmas.index = mas.index; 2783 - newmas.last = mas.last; 2784 - mas_store(&newmas, tmp); 2785 - } 2786 - rcu_read_unlock(); 2787 - mas_destroy(&newmas); 2788 - 2789 - __mt_destroy(&newmt); 2790 - up_write(&newmt_lock); 2791 - } 2792 - 2793 - /* Duplicate many sizes of trees. Mainly to test expected entry values */ 2794 - static noinline void __init check_dup(struct maple_tree *mt) 2795 - { 2796 - int i; 2797 - int big_start = 100010; 2798 - 2799 - /* Check with a value at zero */ 2800 - for (i = 10; i < 1000; i++) { 2801 - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); 2802 - check_dup_gaps(mt, i, true, 5); 2803 - mtree_destroy(mt); 2804 - rcu_barrier(); 2805 - } 2806 - 2807 - cond_resched(); 2808 - mt_cache_shrink(); 2809 - /* Check with a value at zero, no gap */ 2810 - for (i = 1000; i < 2000; i++) { 2811 - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); 2812 - check_dup_gaps(mt, i, true, 0); 2813 - mtree_destroy(mt); 2814 - rcu_barrier(); 2815 - } 2816 - 2817 - cond_resched(); 2818 - mt_cache_shrink(); 2819 - /* Check with a value at zero and unreasonably large */ 2820 - for (i = big_start; i < big_start + 10; i++) { 2821 - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); 2822 - check_dup_gaps(mt, i, true, 5); 2823 - mtree_destroy(mt); 2824 - rcu_barrier(); 2825 - } 2826 - 2827 - cond_resched(); 2828 - mt_cache_shrink(); 2829 - /* Small to medium size not starting at zero*/ 2830 - for (i = 200; i < 1000; i++) { 2831 - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); 2832 - check_dup_gaps(mt, i, false, 5); 2833 - mtree_destroy(mt); 2834 - rcu_barrier(); 2835 - } 2836 - 2837 - cond_resched(); 2838 - mt_cache_shrink(); 2839 - /* Unreasonably large not starting at zero*/ 2840 - for (i = big_start; i < big_start + 10; i++) { 2841 - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); 2842 - check_dup_gaps(mt, i, false, 5); 2843 - mtree_destroy(mt); 2844 - rcu_barrier(); 2845 - cond_resched(); 2846 - mt_cache_shrink(); 2847 - } 2848 - 2849 - /* Check non-allocation tree not starting at zero */ 2850 - for (i = 1500; i < 3000; i++) { 2851 - mt_init_flags(mt, 0); 2852 - check_dup_gaps(mt, i, false, 5); 2853 - mtree_destroy(mt); 2854 - rcu_barrier(); 2855 - cond_resched(); 2856 - if (i % 2 == 0) 2857 - mt_cache_shrink(); 2858 - } 2859 - 2860 - mt_cache_shrink(); 2861 - /* Check non-allocation tree starting at zero */ 2862 - for (i = 200; i < 1000; i++) { 2863 - mt_init_flags(mt, 0); 2864 - check_dup_gaps(mt, i, true, 5); 2865 - 
mtree_destroy(mt); 2866 - rcu_barrier(); 2867 - cond_resched(); 2868 - } 2869 - 2870 - mt_cache_shrink(); 2871 - /* Unreasonably large */ 2872 - for (i = big_start + 5; i < big_start + 10; i++) { 2873 - mt_init_flags(mt, 0); 2874 - check_dup_gaps(mt, i, true, 5); 2875 - mtree_destroy(mt); 2876 - rcu_barrier(); 2877 - mt_cache_shrink(); 2878 - cond_resched(); 2879 - } 2880 - } 2881 - 2882 2749 static noinline void __init check_bnode_min_spanning(struct maple_tree *mt) 2883 2750 { 2884 2751 int i = 50; ··· 3942 4075 3943 4076 mt_init_flags(&tree, 0); 3944 4077 check_fuzzer(&tree); 3945 - mtree_destroy(&tree); 3946 - 3947 - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); 3948 - check_dup(&tree); 3949 4078 mtree_destroy(&tree); 3950 4079 3951 4080 mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
+1
mm/Kconfig
···

 config SLUB
 	def_bool y
+	select IRQ_WORK

 config KVFREE_RCU_BATCHED
 	def_bool y
+4
mm/internal.h
···
 #define alloc_frozen_pages(...) \
 	alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))

+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
+#define alloc_frozen_pages_nolock(...) \
+	alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
+
 extern void zone_pcp_reset(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);
 extern void zone_pcp_enable(struct zone *zone);
+4 -1
mm/kasan/common.c
···
 }

 bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
-		       bool still_accessible)
+		       bool still_accessible, bool no_quarantine)
 {
 	if (!kasan_arch_is_ready() || is_kfence_address(object))
 		return false;
···
 		return false;

 	poison_slab_object(cache, object, init);
+
+	if (no_quarantine)
+		return false;

 	/*
 	 * If the object is put into quarantine, do not let slab put the object
+32 -23
mm/page_alloc.c
···

 #endif /* CONFIG_UNACCEPTED_MEMORY */

-/**
- * alloc_pages_nolock - opportunistic reentrant allocation from any context
- * @nid: node to allocate from
- * @order: allocation order size
- *
- * Allocates pages of a given order from the given node. This is safe to
- * call from any context (from atomic, NMI, and also reentrant
- * allocator -> tracepoint -> alloc_pages_nolock_noprof).
- * Allocation is best effort and to be expected to fail easily so nobody should
- * rely on the success. Failures are not reported via warn_alloc().
- * See always fail conditions below.
- *
- * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
- * It means ENOMEM. There is no reason to call it again and expect !NULL.
- */
-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
 {
 	/*
 	 * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed.
···
 	 * specify it here to highlight that alloc_pages_nolock()
 	 * doesn't want to deplete reserves.
 	 */
-	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
-			| __GFP_ACCOUNT;
+	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
+			| gfp_flags;
 	unsigned int alloc_flags = ALLOC_TRYLOCK;
 	struct alloc_context ac = { };
 	struct page *page;

+	VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
 	/*
 	 * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
 	 * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
···

 	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */

-	if (page)
-		set_page_refcounted(page);
-
-	if (memcg_kmem_online() && page &&
+	if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) &&
 	    unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
-		free_pages_nolock(page, order);
+		__free_frozen_pages(page, order, FPI_TRYLOCK);
 		page = NULL;
 	}
 	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
 	kmsan_alloc_page(page, order, alloc_gfp);
 	return page;
 }
+/**
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * Allocation is best effort and to be expected to fail easily so nobody should
+ * rely on the success. Failures are not reported via warn_alloc().
+ * See always fail conditions below.
+ *
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
+ */
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
+{
+	struct page *page;
+
+	page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order);
+	if (page)
+		set_page_refcounted(page);
+	return page;
+}
+EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);
+18 -2
mm/slab.h
···
 	struct {
 		union {
 			struct list_head slab_list;
+			struct { /* For deferred deactivate_slab() */
+				struct llist_node llnode;
+				void *flush_freelist;
+			};
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 			struct {
 				struct slab *next;
···
 struct kmem_cache {
 #ifndef CONFIG_SLUB_TINY
 	struct kmem_cache_cpu __percpu *cpu_slab;
+	struct lock_class_key lock_key;
 #endif
+	struct slub_percpu_sheaves __percpu *cpu_sheaves;
 	/* Used for retrieving partial slabs, etc. */
 	slab_flags_t flags;
 	unsigned long min_partial;
···
 	/* Number of per cpu partial slabs to keep around */
 	unsigned int cpu_partial_slabs;
 #endif
+	unsigned int sheaf_capacity;
 	struct kmem_cache_order_objects oo;

 	/* Allocation and freeing of slabs */
···
 	return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
 }

+bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
+void flush_all_rcu_sheaves(void);
+
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
 			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
···
 	unsigned long obj_exts = READ_ONCE(slab->obj_exts);

 #ifdef CONFIG_MEMCG
-	VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
-							slab_page(slab));
+	/*
+	 * obj_exts should be either NULL, a valid pointer with
+	 * MEMCG_DATA_OBJEXTS bit set or be equal to OBJEXTS_ALLOC_FAIL.
+	 */
+	VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) &&
+		       obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
 	VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
 #endif
 	return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
···

 void __check_heap_object(const void *ptr, unsigned long n,
 			 const struct slab *slab, bool to_user);
+
+void defer_free_barrier(void);

 static inline bool slub_debug_orig_size(struct kmem_cache *s)
 {
+36 -1
mm/slab_common.c
···
 		return 1;
 #endif

+	if (s->cpu_sheaves)
+		return 1;
+
 	/*
 	 * We may have set a slab to be unmergeable during bootstrap.
 	 */
···
 		     object_size - args->usersize < args->useroffset))
 		args->usersize = args->useroffset = 0;

-	if (!args->usersize)
+	if (!args->usersize && !args->sheaf_capacity)
 		s = __kmem_cache_alias(name, object_size, args->align, flags,
 				       args->ctor);
 	if (s)
···
 		 */
 		rcu_barrier();
 	}
+
+	/* Wait for deferred work from kmalloc/kfree_nolock() */
+	defer_free_barrier();

 	cpus_read_lock();
 	mutex_lock(&slab_mutex);
···
 	kvfree_rcu_list(head);
 }

+static bool kfree_rcu_sheaf(void *obj)
+{
+	struct kmem_cache *s;
+	struct folio *folio;
+	struct slab *slab;
+
+	if (is_vmalloc_addr(obj))
+		return false;
+
+	folio = virt_to_folio(obj);
+	if (unlikely(!folio_test_slab(folio)))
+		return false;
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	if (s->cpu_sheaves) {
+		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
+			   slab_nid(slab) == numa_mem_id()))
+			return __kfree_rcu_sheaf(s, obj);
+	}
+
+	return false;
+}
+
 static bool
 need_offload_krc(struct kfree_rcu_cpu *krcp)
 {
···
 	if (!head)
 		might_sleep();

+	if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
+		return;
+
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
···
 	struct kfree_rcu_cpu *krcp;
 	bool queued;
 	int i, cpu;
+
+	flush_all_rcu_sheaves();

 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
+2226 -135
mm/slub.c
··· 44 44 #include <kunit/test.h> 45 45 #include <kunit/test-bug.h> 46 46 #include <linux/sort.h> 47 - 47 + #include <linux/irq_work.h> 48 + #include <linux/kprobes.h> 48 49 #include <linux/debugfs.h> 49 50 #include <trace/events/kmem.h> 50 51 ··· 364 363 #endif 365 364 366 365 enum stat_item { 366 + ALLOC_PCS, /* Allocation from percpu sheaf */ 367 367 ALLOC_FASTPATH, /* Allocation from cpu slab */ 368 368 ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ 369 + FREE_PCS, /* Free to percpu sheaf */ 370 + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ 371 + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ 369 372 FREE_FASTPATH, /* Free to cpu slab */ 370 373 FREE_SLOWPATH, /* Freeing not to cpu slab */ 371 374 FREE_FROZEN, /* Freeing to frozen slab */ ··· 394 389 CPU_PARTIAL_FREE, /* Refill cpu partial on free */ 395 390 CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ 396 391 CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ 392 + SHEAF_FLUSH, /* Objects flushed from a sheaf */ 393 + SHEAF_REFILL, /* Objects refilled to a sheaf */ 394 + SHEAF_ALLOC, /* Allocation of an empty sheaf */ 395 + SHEAF_FREE, /* Freeing of an empty sheaf */ 396 + BARN_GET, /* Got full sheaf from barn */ 397 + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ 398 + BARN_PUT, /* Put full sheaf to barn */ 399 + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ 400 + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ 401 + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ 402 + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ 403 + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ 404 + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ 397 405 NR_SLUB_STAT_ITEMS 398 406 }; 399 407 ··· 427 409 #ifdef CONFIG_SLUB_CPU_PARTIAL 428 410 struct slab *partial; /* Partially allocated slabs */ 429 411 #endif 430 - local_lock_t lock; /* Protects the fields above */ 412 + local_trylock_t lock; /* Protects the fields above */ 431 413 #ifdef CONFIG_SLUB_STATS 432 414 unsigned int stat[NR_SLUB_STAT_ITEMS]; 433 415 #endif ··· 453 435 #endif 454 436 } 455 437 438 + #define MAX_FULL_SHEAVES 10 439 + #define MAX_EMPTY_SHEAVES 10 440 + 441 + struct node_barn { 442 + spinlock_t lock; 443 + struct list_head sheaves_full; 444 + struct list_head sheaves_empty; 445 + unsigned int nr_full; 446 + unsigned int nr_empty; 447 + }; 448 + 449 + struct slab_sheaf { 450 + union { 451 + struct rcu_head rcu_head; 452 + struct list_head barn_list; 453 + /* only used for prefilled sheafs */ 454 + unsigned int capacity; 455 + }; 456 + struct kmem_cache *cache; 457 + unsigned int size; 458 + int node; /* only used for rcu_sheaf */ 459 + void *objects[]; 460 + }; 461 + 462 + struct slub_percpu_sheaves { 463 + local_trylock_t lock; 464 + struct slab_sheaf *main; /* never NULL when unlocked */ 465 + struct slab_sheaf *spare; /* empty or full, may be NULL */ 466 + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ 467 + }; 468 + 456 469 /* 457 470 * The slab lists for all objects. 
458 471 */ ··· 496 447 atomic_long_t total_objects; 497 448 struct list_head full; 498 449 #endif 450 + struct node_barn *barn; 499 451 }; 500 452 501 453 static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 502 454 { 503 455 return s->node[node]; 456 + } 457 + 458 + /* Get the barn of the current cpu's memory node */ 459 + static inline struct node_barn *get_barn(struct kmem_cache *s) 460 + { 461 + return get_node(s, numa_mem_id())->barn; 504 462 } 505 463 506 464 /* ··· 526 470 */ 527 471 static nodemask_t slab_nodes; 528 472 529 - #ifndef CONFIG_SLUB_TINY 530 473 /* 531 474 * Workqueue used for flush_cpu_slab(). 532 475 */ 533 476 static struct workqueue_struct *flushwq; 534 - #endif 477 + 478 + struct slub_flush_work { 479 + struct work_struct work; 480 + struct kmem_cache *s; 481 + bool skip; 482 + }; 483 + 484 + static DEFINE_MUTEX(flush_lock); 485 + static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); 535 486 536 487 /******************************************************************** 537 488 * Core slab cache functions ··· 885 822 } 886 823 887 824 #ifdef CONFIG_SLUB_DEBUG 825 + 826 + /* 827 + * For debugging context when we want to check if the struct slab pointer 828 + * appears to be valid. 829 + */ 830 + static inline bool validate_slab_ptr(struct slab *slab) 831 + { 832 + return PageSlab(slab_page(slab)); 833 + } 834 + 888 835 static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; 889 836 static DEFINE_SPINLOCK(object_map_lock); 890 837 ··· 1522 1449 return ret; 1523 1450 } 1524 1451 1452 + /* 1453 + * Checks if the slab state looks sane. Assumes the struct slab pointer 1454 + * was either obtained in a way that ensures it's valid, or validated 1455 + * by validate_slab_ptr() 1456 + */ 1525 1457 static int check_slab(struct kmem_cache *s, struct slab *slab) 1526 1458 { 1527 1459 int maxobj; 1528 - 1529 - if (!folio_test_slab(slab_folio(slab))) { 1530 - slab_err(s, slab, "Not a valid slab page"); 1531 - return 0; 1532 - } 1533 1460 1534 1461 maxobj = order_objects(slab_order(slab), s->size); 1535 1462 if (slab->objects > maxobj) { ··· 1726 1653 return true; 1727 1654 1728 1655 bad: 1729 - if (folio_test_slab(slab_folio(slab))) { 1730 - /* 1731 - * If this is a slab page then lets do the best we can 1732 - * to avoid issues in the future. Marking all objects 1733 - * as used avoids touching the remaining objects. 1734 - */ 1735 - slab_fix(s, "Marking all objects used"); 1736 - slab->inuse = slab->objects; 1737 - slab->freelist = NULL; 1738 - slab->frozen = 1; /* mark consistency-failed slab as frozen */ 1739 - } 1656 + /* 1657 + * Let's do the best we can to avoid issues in the future. Marking all 1658 + * objects as used avoids touching the remaining objects. 1659 + */ 1660 + slab_fix(s, "Marking all objects used"); 1661 + slab->inuse = slab->objects; 1662 + slab->freelist = NULL; 1663 + slab->frozen = 1; /* mark consistency-failed slab as frozen */ 1664 + 1740 1665 return false; 1741 1666 } 1742 1667 ··· 1755 1684 return 0; 1756 1685 1757 1686 if (unlikely(s != slab->slab_cache)) { 1758 - if (!folio_test_slab(slab_folio(slab))) { 1759 - slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", 1760 - object); 1761 - } else if (!slab->slab_cache) { 1687 + if (!slab->slab_cache) { 1762 1688 slab_err(NULL, slab, "No slab cache for object 0x%p", 1763 1689 object); 1764 1690 } else { ··· 2057 1989 * objects with no tag reference. Mark all references in this 2058 1990 * vector as empty to avoid warnings later on. 
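 * (With this series the check below uses "obj_exts == OBJEXTS_ALLOC_FAIL"
 * rather than testing the bit: the marker appears to reuse a bit that can
 * now also be set alongside a valid vector pointer, so only an exact match
 * means the vector allocation itself failed.)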
2059 1991 */ 2060 - if (obj_exts & OBJEXTS_ALLOC_FAIL) { 1992 + if (obj_exts == OBJEXTS_ALLOC_FAIL) { 2061 1993 unsigned int i; 2062 1994 2063 1995 for (i = 0; i < objects; i++) ··· 2090 2022 int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, 2091 2023 gfp_t gfp, bool new_slab) 2092 2024 { 2025 + bool allow_spin = gfpflags_allow_spinning(gfp); 2093 2026 unsigned int objects = objs_per_slab(s, slab); 2094 2027 unsigned long new_exts; 2095 2028 unsigned long old_exts; ··· 2099 2030 gfp &= ~OBJCGS_CLEAR_MASK; 2100 2031 /* Prevent recursive extension vector allocation */ 2101 2032 gfp |= __GFP_NO_OBJ_EXT; 2102 - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, 2103 - slab_nid(slab)); 2033 + 2034 + /* 2035 + * Note that allow_spin may be false during early boot and its 2036 + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting 2037 + * architectures with cmpxchg16b, early obj_exts will be missing for 2038 + * very early allocations on those. 2039 + */ 2040 + if (unlikely(!allow_spin)) { 2041 + size_t sz = objects * sizeof(struct slabobj_ext); 2042 + 2043 + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, 2044 + slab_nid(slab)); 2045 + } else { 2046 + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, 2047 + slab_nid(slab)); 2048 + } 2104 2049 if (!vec) { 2105 2050 /* Mark vectors which failed to allocate */ 2106 - if (new_slab) 2107 - mark_failed_objexts_alloc(slab); 2051 + mark_failed_objexts_alloc(slab); 2108 2052 2109 2053 return -ENOMEM; 2110 2054 } 2111 2055 2112 2056 new_exts = (unsigned long)vec; 2057 + if (unlikely(!allow_spin)) 2058 + new_exts |= OBJEXTS_NOSPIN_ALLOC; 2113 2059 #ifdef CONFIG_MEMCG 2114 2060 new_exts |= MEMCG_DATA_OBJEXTS; 2115 2061 #endif ··· 2145 2061 * objcg vector should be reused. 2146 2062 */ 2147 2063 mark_objexts_empty(vec); 2148 - kfree(vec); 2064 + if (unlikely(!allow_spin)) 2065 + kfree_nolock(vec); 2066 + else 2067 + kfree(vec); 2149 2068 return 0; 2150 2069 } 2151 2070 ··· 2172 2085 * the extension for obj_exts is expected to be NULL. 2173 2086 */ 2174 2087 mark_objexts_empty(obj_exts); 2175 - kfree(obj_exts); 2088 + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) 2089 + kfree_nolock(obj_exts); 2090 + else 2091 + kfree(obj_exts); 2176 2092 slab->obj_exts = 0; 2177 2093 } 2178 2094 ··· 2509 2419 2510 2420 } 2511 2421 /* KASAN might put x into memory quarantine, delaying its reuse. 
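 * (The added "false" argument below is the new no_quarantine flag; the
 * lock-free kfree_nolock() path is expected to pass true instead, since
 * putting an object into the KASAN quarantine may take locks that cannot
 * be acquired in that context.)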
*/ 2512 - return !kasan_slab_free(s, x, init, still_accessible); 2422 + return !kasan_slab_free(s, x, init, still_accessible, false); 2513 2423 } 2514 2424 2515 2425 static __fastpath_inline ··· 2568 2478 return object; 2569 2479 } 2570 2480 2481 + static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) 2482 + { 2483 + struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, 2484 + s->sheaf_capacity), gfp); 2485 + 2486 + if (unlikely(!sheaf)) 2487 + return NULL; 2488 + 2489 + sheaf->cache = s; 2490 + 2491 + stat(s, SHEAF_ALLOC); 2492 + 2493 + return sheaf; 2494 + } 2495 + 2496 + static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) 2497 + { 2498 + kfree(sheaf); 2499 + 2500 + stat(s, SHEAF_FREE); 2501 + } 2502 + 2503 + static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 2504 + size_t size, void **p); 2505 + 2506 + 2507 + static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, 2508 + gfp_t gfp) 2509 + { 2510 + int to_fill = s->sheaf_capacity - sheaf->size; 2511 + int filled; 2512 + 2513 + if (!to_fill) 2514 + return 0; 2515 + 2516 + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, 2517 + &sheaf->objects[sheaf->size]); 2518 + 2519 + sheaf->size += filled; 2520 + 2521 + stat_add(s, SHEAF_REFILL, filled); 2522 + 2523 + if (filled < to_fill) 2524 + return -ENOMEM; 2525 + 2526 + return 0; 2527 + } 2528 + 2529 + 2530 + static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) 2531 + { 2532 + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); 2533 + 2534 + if (!sheaf) 2535 + return NULL; 2536 + 2537 + if (refill_sheaf(s, sheaf, gfp)) { 2538 + free_empty_sheaf(s, sheaf); 2539 + return NULL; 2540 + } 2541 + 2542 + return sheaf; 2543 + } 2544 + 2545 + /* 2546 + * Maximum number of objects freed during a single flush of main pcs sheaf. 2547 + * Translates directly to an on-stack array size. 2548 + */ 2549 + #define PCS_BATCH_MAX 32U 2550 + 2551 + static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); 2552 + 2553 + /* 2554 + * Free all objects from the main sheaf. In order to perform 2555 + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where 2556 + * object pointers are moved to a on-stack array under the lock. To bound the 2557 + * stack usage, limit each batch to PCS_BATCH_MAX. 2558 + * 2559 + * returns true if at least partially flushed 2560 + */ 2561 + static bool sheaf_flush_main(struct kmem_cache *s) 2562 + { 2563 + struct slub_percpu_sheaves *pcs; 2564 + unsigned int batch, remaining; 2565 + void *objects[PCS_BATCH_MAX]; 2566 + struct slab_sheaf *sheaf; 2567 + bool ret = false; 2568 + 2569 + next_batch: 2570 + if (!local_trylock(&s->cpu_sheaves->lock)) 2571 + return ret; 2572 + 2573 + pcs = this_cpu_ptr(s->cpu_sheaves); 2574 + sheaf = pcs->main; 2575 + 2576 + batch = min(PCS_BATCH_MAX, sheaf->size); 2577 + 2578 + sheaf->size -= batch; 2579 + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); 2580 + 2581 + remaining = sheaf->size; 2582 + 2583 + local_unlock(&s->cpu_sheaves->lock); 2584 + 2585 + __kmem_cache_free_bulk(s, batch, &objects[0]); 2586 + 2587 + stat_add(s, SHEAF_FLUSH, batch); 2588 + 2589 + ret = true; 2590 + 2591 + if (remaining) 2592 + goto next_batch; 2593 + 2594 + return ret; 2595 + } 2596 + 2597 + /* 2598 + * Free all objects from a sheaf that's unused, i.e. not linked to any 2599 + * cpu_sheaves, so we need no locking and batching. 
The locking is also not 2600 + * necessary when flushing cpu's sheaves (both spare and main) during cpu 2601 + * hotremove as the cpu is not executing anymore. 2602 + */ 2603 + static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) 2604 + { 2605 + if (!sheaf->size) 2606 + return; 2607 + 2608 + stat_add(s, SHEAF_FLUSH, sheaf->size); 2609 + 2610 + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); 2611 + 2612 + sheaf->size = 0; 2613 + } 2614 + 2615 + static void __rcu_free_sheaf_prepare(struct kmem_cache *s, 2616 + struct slab_sheaf *sheaf) 2617 + { 2618 + bool init = slab_want_init_on_free(s); 2619 + void **p = &sheaf->objects[0]; 2620 + unsigned int i = 0; 2621 + 2622 + while (i < sheaf->size) { 2623 + struct slab *slab = virt_to_slab(p[i]); 2624 + 2625 + memcg_slab_free_hook(s, slab, p + i, 1); 2626 + alloc_tagging_slab_free_hook(s, slab, p + i, 1); 2627 + 2628 + if (unlikely(!slab_free_hook(s, p[i], init, true))) { 2629 + p[i] = p[--sheaf->size]; 2630 + continue; 2631 + } 2632 + 2633 + i++; 2634 + } 2635 + } 2636 + 2637 + static void rcu_free_sheaf_nobarn(struct rcu_head *head) 2638 + { 2639 + struct slab_sheaf *sheaf; 2640 + struct kmem_cache *s; 2641 + 2642 + sheaf = container_of(head, struct slab_sheaf, rcu_head); 2643 + s = sheaf->cache; 2644 + 2645 + __rcu_free_sheaf_prepare(s, sheaf); 2646 + 2647 + sheaf_flush_unused(s, sheaf); 2648 + 2649 + free_empty_sheaf(s, sheaf); 2650 + } 2651 + 2652 + /* 2653 + * Caller needs to make sure migration is disabled in order to fully flush 2654 + * single cpu's sheaves 2655 + * 2656 + * must not be called from an irq 2657 + * 2658 + * flushing operations are rare so let's keep it simple and flush to slabs 2659 + * directly, skipping the barn 2660 + */ 2661 + static void pcs_flush_all(struct kmem_cache *s) 2662 + { 2663 + struct slub_percpu_sheaves *pcs; 2664 + struct slab_sheaf *spare, *rcu_free; 2665 + 2666 + local_lock(&s->cpu_sheaves->lock); 2667 + pcs = this_cpu_ptr(s->cpu_sheaves); 2668 + 2669 + spare = pcs->spare; 2670 + pcs->spare = NULL; 2671 + 2672 + rcu_free = pcs->rcu_free; 2673 + pcs->rcu_free = NULL; 2674 + 2675 + local_unlock(&s->cpu_sheaves->lock); 2676 + 2677 + if (spare) { 2678 + sheaf_flush_unused(s, spare); 2679 + free_empty_sheaf(s, spare); 2680 + } 2681 + 2682 + if (rcu_free) 2683 + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); 2684 + 2685 + sheaf_flush_main(s); 2686 + } 2687 + 2688 + static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) 2689 + { 2690 + struct slub_percpu_sheaves *pcs; 2691 + 2692 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 2693 + 2694 + /* The cpu is not executing anymore so we don't need pcs->lock */ 2695 + sheaf_flush_unused(s, pcs->main); 2696 + if (pcs->spare) { 2697 + sheaf_flush_unused(s, pcs->spare); 2698 + free_empty_sheaf(s, pcs->spare); 2699 + pcs->spare = NULL; 2700 + } 2701 + 2702 + if (pcs->rcu_free) { 2703 + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); 2704 + pcs->rcu_free = NULL; 2705 + } 2706 + } 2707 + 2708 + static void pcs_destroy(struct kmem_cache *s) 2709 + { 2710 + int cpu; 2711 + 2712 + for_each_possible_cpu(cpu) { 2713 + struct slub_percpu_sheaves *pcs; 2714 + 2715 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 2716 + 2717 + /* can happen when unwinding failed create */ 2718 + if (!pcs->main) 2719 + continue; 2720 + 2721 + /* 2722 + * We have already passed __kmem_cache_shutdown() so everything 2723 + * was flushed and there should be no objects allocated from 2724 + * slabs, otherwise kmem_cache_destroy() would have 
aborted. 2725 + * Therefore something would have to be really wrong if the 2726 + * warnings here trigger, and we should rather leave objects and 2727 + * sheaves to leak in that case. 2728 + */ 2729 + 2730 + WARN_ON(pcs->spare); 2731 + WARN_ON(pcs->rcu_free); 2732 + 2733 + if (!WARN_ON(pcs->main->size)) { 2734 + free_empty_sheaf(s, pcs->main); 2735 + pcs->main = NULL; 2736 + } 2737 + } 2738 + 2739 + free_percpu(s->cpu_sheaves); 2740 + s->cpu_sheaves = NULL; 2741 + } 2742 + 2743 + static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) 2744 + { 2745 + struct slab_sheaf *empty = NULL; 2746 + unsigned long flags; 2747 + 2748 + if (!data_race(barn->nr_empty)) 2749 + return NULL; 2750 + 2751 + spin_lock_irqsave(&barn->lock, flags); 2752 + 2753 + if (likely(barn->nr_empty)) { 2754 + empty = list_first_entry(&barn->sheaves_empty, 2755 + struct slab_sheaf, barn_list); 2756 + list_del(&empty->barn_list); 2757 + barn->nr_empty--; 2758 + } 2759 + 2760 + spin_unlock_irqrestore(&barn->lock, flags); 2761 + 2762 + return empty; 2763 + } 2764 + 2765 + /* 2766 + * The following two functions are used mainly in cases where we have to undo an 2767 + * intended action due to a race or cpu migration. Thus they do not check the 2768 + * empty or full sheaf limits for simplicity. 2769 + */ 2770 + 2771 + static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) 2772 + { 2773 + unsigned long flags; 2774 + 2775 + spin_lock_irqsave(&barn->lock, flags); 2776 + 2777 + list_add(&sheaf->barn_list, &barn->sheaves_empty); 2778 + barn->nr_empty++; 2779 + 2780 + spin_unlock_irqrestore(&barn->lock, flags); 2781 + } 2782 + 2783 + static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) 2784 + { 2785 + unsigned long flags; 2786 + 2787 + spin_lock_irqsave(&barn->lock, flags); 2788 + 2789 + list_add(&sheaf->barn_list, &barn->sheaves_full); 2790 + barn->nr_full++; 2791 + 2792 + spin_unlock_irqrestore(&barn->lock, flags); 2793 + } 2794 + 2795 + static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) 2796 + { 2797 + struct slab_sheaf *sheaf = NULL; 2798 + unsigned long flags; 2799 + 2800 + if (!data_race(barn->nr_full) && !data_race(barn->nr_empty)) 2801 + return NULL; 2802 + 2803 + spin_lock_irqsave(&barn->lock, flags); 2804 + 2805 + if (barn->nr_full) { 2806 + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, 2807 + barn_list); 2808 + list_del(&sheaf->barn_list); 2809 + barn->nr_full--; 2810 + } else if (barn->nr_empty) { 2811 + sheaf = list_first_entry(&barn->sheaves_empty, 2812 + struct slab_sheaf, barn_list); 2813 + list_del(&sheaf->barn_list); 2814 + barn->nr_empty--; 2815 + } 2816 + 2817 + spin_unlock_irqrestore(&barn->lock, flags); 2818 + 2819 + return sheaf; 2820 + } 2821 + 2822 + /* 2823 + * If a full sheaf is available, return it and put the supplied empty one to 2824 + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't 2825 + * change. 
2826 + */ 2827 + static struct slab_sheaf * 2828 + barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) 2829 + { 2830 + struct slab_sheaf *full = NULL; 2831 + unsigned long flags; 2832 + 2833 + if (!data_race(barn->nr_full)) 2834 + return NULL; 2835 + 2836 + spin_lock_irqsave(&barn->lock, flags); 2837 + 2838 + if (likely(barn->nr_full)) { 2839 + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, 2840 + barn_list); 2841 + list_del(&full->barn_list); 2842 + list_add(&empty->barn_list, &barn->sheaves_empty); 2843 + barn->nr_full--; 2844 + barn->nr_empty++; 2845 + } 2846 + 2847 + spin_unlock_irqrestore(&barn->lock, flags); 2848 + 2849 + return full; 2850 + } 2851 + 2852 + /* 2853 + * If an empty sheaf is available, return it and put the supplied full one to 2854 + * barn. But if there are too many full sheaves, reject this with -E2BIG. 2855 + */ 2856 + static struct slab_sheaf * 2857 + barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) 2858 + { 2859 + struct slab_sheaf *empty; 2860 + unsigned long flags; 2861 + 2862 + /* we don't repeat this check under barn->lock as it's not critical */ 2863 + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES) 2864 + return ERR_PTR(-E2BIG); 2865 + if (!data_race(barn->nr_empty)) 2866 + return ERR_PTR(-ENOMEM); 2867 + 2868 + spin_lock_irqsave(&barn->lock, flags); 2869 + 2870 + if (likely(barn->nr_empty)) { 2871 + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, 2872 + barn_list); 2873 + list_del(&empty->barn_list); 2874 + list_add(&full->barn_list, &barn->sheaves_full); 2875 + barn->nr_empty--; 2876 + barn->nr_full++; 2877 + } else { 2878 + empty = ERR_PTR(-ENOMEM); 2879 + } 2880 + 2881 + spin_unlock_irqrestore(&barn->lock, flags); 2882 + 2883 + return empty; 2884 + } 2885 + 2886 + static void barn_init(struct node_barn *barn) 2887 + { 2888 + spin_lock_init(&barn->lock); 2889 + INIT_LIST_HEAD(&barn->sheaves_full); 2890 + INIT_LIST_HEAD(&barn->sheaves_empty); 2891 + barn->nr_full = 0; 2892 + barn->nr_empty = 0; 2893 + } 2894 + 2895 + static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) 2896 + { 2897 + struct list_head empty_list; 2898 + struct list_head full_list; 2899 + struct slab_sheaf *sheaf, *sheaf2; 2900 + unsigned long flags; 2901 + 2902 + INIT_LIST_HEAD(&empty_list); 2903 + INIT_LIST_HEAD(&full_list); 2904 + 2905 + spin_lock_irqsave(&barn->lock, flags); 2906 + 2907 + list_splice_init(&barn->sheaves_full, &full_list); 2908 + barn->nr_full = 0; 2909 + list_splice_init(&barn->sheaves_empty, &empty_list); 2910 + barn->nr_empty = 0; 2911 + 2912 + spin_unlock_irqrestore(&barn->lock, flags); 2913 + 2914 + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { 2915 + sheaf_flush_unused(s, sheaf); 2916 + free_empty_sheaf(s, sheaf); 2917 + } 2918 + 2919 + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) 2920 + free_empty_sheaf(s, sheaf); 2921 + } 2922 + 2571 2923 /* 2572 2924 * Slab allocation and freeing 2573 2925 */ 2574 2926 static inline struct slab *alloc_slab_page(gfp_t flags, int node, 2575 - struct kmem_cache_order_objects oo) 2927 + struct kmem_cache_order_objects oo, 2928 + bool allow_spin) 2576 2929 { 2577 2930 struct folio *folio; 2578 2931 struct slab *slab; 2579 2932 unsigned int order = oo_order(oo); 2580 2933 2581 - if (node == NUMA_NO_NODE) 2934 + if (unlikely(!allow_spin)) 2935 + folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, 2936 + node, order); 2937 + else if (node == NUMA_NO_NODE) 2582 2938 folio 
= (struct folio *)alloc_frozen_pages(flags, order); 2583 2939 else 2584 2940 folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); ··· 3174 2638 3175 2639 static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 3176 2640 { 2641 + bool allow_spin = gfpflags_allow_spinning(flags); 3177 2642 struct slab *slab; 3178 2643 struct kmem_cache_order_objects oo = s->oo; 3179 2644 gfp_t alloc_gfp; ··· 3194 2657 if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) 3195 2658 alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; 3196 2659 3197 - slab = alloc_slab_page(alloc_gfp, node, oo); 2660 + /* 2661 + * __GFP_RECLAIM could be cleared on the first allocation attempt, 2662 + * so pass allow_spin flag directly. 2663 + */ 2664 + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); 3198 2665 if (unlikely(!slab)) { 3199 2666 oo = s->min; 3200 2667 alloc_gfp = flags; ··· 3206 2665 * Allocation may have failed due to fragmentation. 3207 2666 * Try a lower order alloc if possible 3208 2667 */ 3209 - slab = alloc_slab_page(alloc_gfp, node, oo); 2668 + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); 3210 2669 if (unlikely(!slab)) 3211 2670 return NULL; 3212 2671 stat(s, ORDER_FALLBACK); ··· 3357 2816 3358 2817 lockdep_assert_held(&n->list_lock); 3359 2818 2819 + #ifdef CONFIG_SLUB_DEBUG 2820 + if (s->flags & SLAB_CONSISTENCY_CHECKS) { 2821 + if (!validate_slab_ptr(slab)) { 2822 + slab_err(s, slab, "Not a valid slab page"); 2823 + return NULL; 2824 + } 2825 + } 2826 + #endif 2827 + 3360 2828 object = slab->freelist; 3361 2829 slab->freelist = get_freepointer(s, object); 3362 2830 slab->inuse++; 3363 2831 3364 2832 if (!alloc_debug_processing(s, slab, object, orig_size)) { 3365 - if (folio_test_slab(slab_folio(slab))) 3366 - remove_partial(n, slab); 2833 + remove_partial(n, slab); 3367 2834 return NULL; 3368 2835 } 3369 2836 ··· 3383 2834 return object; 3384 2835 } 3385 2836 2837 + static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); 2838 + 3386 2839 /* 3387 2840 * Called only for kmem_cache_debug() caches to allocate from a freshly 3388 2841 * allocated slab. Allocate a single object instead of whole freelist 3389 2842 * and put the slab to the partial (or full) list. 3390 2843 */ 3391 - static void *alloc_single_from_new_slab(struct kmem_cache *s, 3392 - struct slab *slab, int orig_size) 2844 + static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, 2845 + int orig_size, gfp_t gfpflags) 3393 2846 { 2847 + bool allow_spin = gfpflags_allow_spinning(gfpflags); 3394 2848 int nid = slab_nid(slab); 3395 2849 struct kmem_cache_node *n = get_node(s, nid); 3396 2850 unsigned long flags; 3397 2851 void *object; 3398 2852 2853 + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { 2854 + /* Unlucky, discard newly allocated slab */ 2855 + slab->frozen = 1; 2856 + defer_deactivate_slab(slab, NULL); 2857 + return NULL; 2858 + } 3399 2859 3400 2860 object = slab->freelist; 3401 2861 slab->freelist = get_freepointer(s, object); 3402 2862 slab->inuse = 1; 3403 2863 3404 - if (!alloc_debug_processing(s, slab, object, orig_size)) 2864 + if (!alloc_debug_processing(s, slab, object, orig_size)) { 3405 2865 /* 3406 2866 * It's not really expected that this would fail on a 3407 2867 * freshly allocated slab, but a concurrent memory 3408 2868 * corruption in theory could cause that. 2869 + * Leak memory of allocated slab. 
3409 2870 */ 2871 + if (!allow_spin) 2872 + spin_unlock_irqrestore(&n->list_lock, flags); 3410 2873 return NULL; 2874 + } 3411 2875 3412 - spin_lock_irqsave(&n->list_lock, flags); 2876 + if (allow_spin) 2877 + spin_lock_irqsave(&n->list_lock, flags); 3413 2878 3414 2879 if (slab->inuse == slab->objects) 3415 2880 add_full(s, n, slab); ··· 3464 2901 if (!n || !n->nr_partial) 3465 2902 return NULL; 3466 2903 3467 - spin_lock_irqsave(&n->list_lock, flags); 2904 + if (gfpflags_allow_spinning(pc->flags)) 2905 + spin_lock_irqsave(&n->list_lock, flags); 2906 + else if (!spin_trylock_irqsave(&n->list_lock, flags)) 2907 + return NULL; 3468 2908 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3469 2909 if (!pfmemalloc_match(slab, pc->flags)) 3470 2910 continue; ··· 3635 3069 3636 3070 pr_info("%s %s: cmpxchg redo ", n, s->name); 3637 3071 3638 - #ifdef CONFIG_PREEMPTION 3639 - if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) 3072 + if (IS_ENABLED(CONFIG_PREEMPTION) && 3073 + tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { 3640 3074 pr_warn("due to cpu change %d -> %d\n", 3641 3075 tid_to_cpu(tid), tid_to_cpu(actual_tid)); 3642 - else 3643 - #endif 3644 - if (tid_to_event(tid) != tid_to_event(actual_tid)) 3076 + } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { 3645 3077 pr_warn("due to cpu running other code. Event %ld->%ld\n", 3646 3078 tid_to_event(tid), tid_to_event(actual_tid)); 3647 - else 3079 + } else { 3648 3080 pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", 3649 3081 actual_tid, tid, next_tid(tid)); 3082 + } 3650 3083 #endif 3651 3084 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 3652 3085 } 3653 3086 3654 3087 static void init_kmem_cache_cpus(struct kmem_cache *s) 3655 3088 { 3089 + #ifdef CONFIG_PREEMPT_RT 3090 + /* 3091 + * Register lockdep key for non-boot kmem caches to avoid 3092 + * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() 3093 + */ 3094 + bool finegrain_lockdep = !init_section_contains(s, 1); 3095 + #else 3096 + /* 3097 + * Don't bother with different lockdep classes for each 3098 + * kmem_cache, since we only use local_trylock_irqsave(). 3099 + */ 3100 + bool finegrain_lockdep = false; 3101 + #endif 3656 3102 int cpu; 3657 3103 struct kmem_cache_cpu *c; 3658 3104 3105 + if (finegrain_lockdep) 3106 + lockdep_register_key(&s->lock_key); 3659 3107 for_each_possible_cpu(cpu) { 3660 3108 c = per_cpu_ptr(s->cpu_slab, cpu); 3661 - local_lock_init(&c->lock); 3109 + local_trylock_init(&c->lock); 3110 + if (finegrain_lockdep) 3111 + lockdep_set_class(&c->lock, &s->lock_key); 3662 3112 c->tid = init_tid(cpu); 3663 3113 } 3664 3114 } ··· 3765 3183 } 3766 3184 } 3767 3185 3186 + /* 3187 + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock 3188 + * can be acquired without a deadlock before invoking the function. 3189 + * 3190 + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is 3191 + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), 3192 + * and kmalloc() is not used in an unsupported context. 3193 + * 3194 + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). 
3195 + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but 3196 + * lockdep_assert() will catch a bug in case: 3197 + * #1 3198 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() 3199 + * or 3200 + * #2 3201 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() 3202 + * 3203 + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt 3204 + * disabled context. The lock will always be acquired and if needed it 3205 + * block and sleep until the lock is available. 3206 + * #1 is possible in !PREEMPT_RT only. 3207 + * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: 3208 + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> 3209 + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) 3210 + * 3211 + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B 3212 + */ 3213 + #if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) 3214 + #define local_lock_cpu_slab(s, flags) \ 3215 + local_lock_irqsave(&(s)->cpu_slab->lock, flags) 3216 + #else 3217 + #define local_lock_cpu_slab(s, flags) \ 3218 + do { \ 3219 + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ 3220 + lockdep_assert(__l); \ 3221 + } while (0) 3222 + #endif 3223 + 3224 + #define local_unlock_cpu_slab(s, flags) \ 3225 + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) 3226 + 3768 3227 #ifdef CONFIG_SLUB_CPU_PARTIAL 3769 3228 static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) 3770 3229 { ··· 3890 3267 unsigned long flags; 3891 3268 int slabs = 0; 3892 3269 3893 - local_lock_irqsave(&s->cpu_slab->lock, flags); 3270 + local_lock_cpu_slab(s, flags); 3894 3271 3895 3272 oldslab = this_cpu_read(s->cpu_slab->partial); 3896 3273 ··· 3915 3292 3916 3293 this_cpu_write(s->cpu_slab->partial, slab); 3917 3294 3918 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3295 + local_unlock_cpu_slab(s, flags); 3919 3296 3920 3297 if (slab_to_put) { 3921 3298 __put_partials(s, slab_to_put); ··· 3972 3349 put_partials_cpu(s, c); 3973 3350 } 3974 3351 3975 - struct slub_flush_work { 3976 - struct work_struct work; 3977 - struct kmem_cache *s; 3978 - bool skip; 3979 - }; 3980 - 3981 - /* 3982 - * Flush cpu slab. 3983 - * 3984 - * Called from CPU work handler with migration disabled. 
3985 - */ 3986 - static void flush_cpu_slab(struct work_struct *w) 3352 + static inline void flush_this_cpu_slab(struct kmem_cache *s) 3987 3353 { 3988 - struct kmem_cache *s; 3989 - struct kmem_cache_cpu *c; 3990 - struct slub_flush_work *sfw; 3991 - 3992 - sfw = container_of(w, struct slub_flush_work, work); 3993 - 3994 - s = sfw->s; 3995 - c = this_cpu_ptr(s->cpu_slab); 3354 + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 3996 3355 3997 3356 if (c->slab) 3998 3357 flush_slab(s, c); ··· 3989 3384 return c->slab || slub_percpu_partial(c); 3990 3385 } 3991 3386 3992 - static DEFINE_MUTEX(flush_lock); 3993 - static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); 3387 + #else /* CONFIG_SLUB_TINY */ 3388 + static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } 3389 + static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } 3390 + static inline void flush_this_cpu_slab(struct kmem_cache *s) { } 3391 + #endif /* CONFIG_SLUB_TINY */ 3392 + 3393 + static bool has_pcs_used(int cpu, struct kmem_cache *s) 3394 + { 3395 + struct slub_percpu_sheaves *pcs; 3396 + 3397 + if (!s->cpu_sheaves) 3398 + return false; 3399 + 3400 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 3401 + 3402 + return (pcs->spare || pcs->rcu_free || pcs->main->size); 3403 + } 3404 + 3405 + /* 3406 + * Flush cpu slab. 3407 + * 3408 + * Called from CPU work handler with migration disabled. 3409 + */ 3410 + static void flush_cpu_slab(struct work_struct *w) 3411 + { 3412 + struct kmem_cache *s; 3413 + struct slub_flush_work *sfw; 3414 + 3415 + sfw = container_of(w, struct slub_flush_work, work); 3416 + 3417 + s = sfw->s; 3418 + 3419 + if (s->cpu_sheaves) 3420 + pcs_flush_all(s); 3421 + 3422 + flush_this_cpu_slab(s); 3423 + } 3994 3424 3995 3425 static void flush_all_cpus_locked(struct kmem_cache *s) 3996 3426 { ··· 4037 3397 4038 3398 for_each_online_cpu(cpu) { 4039 3399 sfw = &per_cpu(slub_flush, cpu); 4040 - if (!has_cpu_slab(cpu, s)) { 3400 + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { 4041 3401 sfw->skip = true; 4042 3402 continue; 4043 3403 } ··· 4064 3424 cpus_read_unlock(); 4065 3425 } 4066 3426 3427 + static void flush_rcu_sheaf(struct work_struct *w) 3428 + { 3429 + struct slub_percpu_sheaves *pcs; 3430 + struct slab_sheaf *rcu_free; 3431 + struct slub_flush_work *sfw; 3432 + struct kmem_cache *s; 3433 + 3434 + sfw = container_of(w, struct slub_flush_work, work); 3435 + s = sfw->s; 3436 + 3437 + local_lock(&s->cpu_sheaves->lock); 3438 + pcs = this_cpu_ptr(s->cpu_sheaves); 3439 + 3440 + rcu_free = pcs->rcu_free; 3441 + pcs->rcu_free = NULL; 3442 + 3443 + local_unlock(&s->cpu_sheaves->lock); 3444 + 3445 + if (rcu_free) 3446 + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); 3447 + } 3448 + 3449 + 3450 + /* needed for kvfree_rcu_barrier() */ 3451 + void flush_all_rcu_sheaves(void) 3452 + { 3453 + struct slub_flush_work *sfw; 3454 + struct kmem_cache *s; 3455 + unsigned int cpu; 3456 + 3457 + cpus_read_lock(); 3458 + mutex_lock(&slab_mutex); 3459 + 3460 + list_for_each_entry(s, &slab_caches, list) { 3461 + if (!s->cpu_sheaves) 3462 + continue; 3463 + 3464 + mutex_lock(&flush_lock); 3465 + 3466 + for_each_online_cpu(cpu) { 3467 + sfw = &per_cpu(slub_flush, cpu); 3468 + 3469 + /* 3470 + * we don't check if rcu_free sheaf exists - racing 3471 + * __kfree_rcu_sheaf() might have just removed it. 
3472 + * by executing flush_rcu_sheaf() on the cpu we make 3473 + * sure the __kfree_rcu_sheaf() finished its call_rcu() 3474 + */ 3475 + 3476 + INIT_WORK(&sfw->work, flush_rcu_sheaf); 3477 + sfw->s = s; 3478 + queue_work_on(cpu, flushwq, &sfw->work); 3479 + } 3480 + 3481 + for_each_online_cpu(cpu) { 3482 + sfw = &per_cpu(slub_flush, cpu); 3483 + flush_work(&sfw->work); 3484 + } 3485 + 3486 + mutex_unlock(&flush_lock); 3487 + } 3488 + 3489 + mutex_unlock(&slab_mutex); 3490 + cpus_read_unlock(); 3491 + 3492 + rcu_barrier(); 3493 + } 3494 + 4067 3495 /* 4068 3496 * Use the cpu notifier to insure that the cpu slabs are flushed when 4069 3497 * necessary. ··· 4141 3433 struct kmem_cache *s; 4142 3434 4143 3435 mutex_lock(&slab_mutex); 4144 - list_for_each_entry(s, &slab_caches, list) 3436 + list_for_each_entry(s, &slab_caches, list) { 4145 3437 __flush_cpu_slab(s, cpu); 3438 + if (s->cpu_sheaves) 3439 + __pcs_flush_all_cpu(s, cpu); 3440 + } 4146 3441 mutex_unlock(&slab_mutex); 4147 3442 return 0; 4148 3443 } 4149 - 4150 - #else /* CONFIG_SLUB_TINY */ 4151 - static inline void flush_all_cpus_locked(struct kmem_cache *s) { } 4152 - static inline void flush_all(struct kmem_cache *s) { } 4153 - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } 4154 - static inline int slub_cpu_dead(unsigned int cpu) { return 0; } 4155 - #endif /* CONFIG_SLUB_TINY */ 4156 3444 4157 3445 /* 4158 3446 * Check if the objects in a per cpu structure fit numa ··· 4430 3726 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4431 3727 unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4432 3728 { 3729 + bool allow_spin = gfpflags_allow_spinning(gfpflags); 4433 3730 void *freelist; 4434 3731 struct slab *slab; 4435 3732 unsigned long flags; ··· 4456 3751 if (unlikely(!node_match(slab, node))) { 4457 3752 /* 4458 3753 * same as above but node_match() being false already 4459 - * implies node != NUMA_NO_NODE 3754 + * implies node != NUMA_NO_NODE. 3755 + * 3756 + * We don't strictly honor pfmemalloc and NUMA preferences 3757 + * when !allow_spin because: 3758 + * 3759 + * 1. Most kmalloc() users allocate objects on the local node, 3760 + * so kmalloc_nolock() tries not to interfere with them by 3761 + * deactivating the cpu slab. 3762 + * 3763 + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause 3764 + * unnecessary slab allocations even when n->partial list 3765 + * is not empty. 
4460 3766 */ 4461 - if (!node_isset(node, slab_nodes)) { 3767 + if (!node_isset(node, slab_nodes) || 3768 + !allow_spin) { 4462 3769 node = NUMA_NO_NODE; 4463 3770 } else { 4464 3771 stat(s, ALLOC_NODE_MISMATCH); ··· 4483 3766 * PFMEMALLOC but right now, we are losing the pfmemalloc 4484 3767 * information when the page leaves the per-cpu allocator 4485 3768 */ 4486 - if (unlikely(!pfmemalloc_match(slab, gfpflags))) 3769 + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) 4487 3770 goto deactivate_slab; 4488 3771 4489 3772 /* must check again c->slab in case we got preempted and it changed */ 4490 - local_lock_irqsave(&s->cpu_slab->lock, flags); 3773 + local_lock_cpu_slab(s, flags); 3774 + 4491 3775 if (unlikely(slab != c->slab)) { 4492 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3776 + local_unlock_cpu_slab(s, flags); 4493 3777 goto reread_slab; 4494 3778 } 4495 3779 freelist = c->freelist; ··· 4502 3784 if (!freelist) { 4503 3785 c->slab = NULL; 4504 3786 c->tid = next_tid(c->tid); 4505 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3787 + local_unlock_cpu_slab(s, flags); 4506 3788 stat(s, DEACTIVATE_BYPASS); 4507 3789 goto new_slab; 4508 3790 } ··· 4521 3803 VM_BUG_ON(!c->slab->frozen); 4522 3804 c->freelist = get_freepointer(s, freelist); 4523 3805 c->tid = next_tid(c->tid); 4524 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3806 + local_unlock_cpu_slab(s, flags); 4525 3807 return freelist; 4526 3808 4527 3809 deactivate_slab: 4528 3810 4529 - local_lock_irqsave(&s->cpu_slab->lock, flags); 3811 + local_lock_cpu_slab(s, flags); 4530 3812 if (slab != c->slab) { 4531 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3813 + local_unlock_cpu_slab(s, flags); 4532 3814 goto reread_slab; 4533 3815 } 4534 3816 freelist = c->freelist; 4535 3817 c->slab = NULL; 4536 3818 c->freelist = NULL; 4537 3819 c->tid = next_tid(c->tid); 4538 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3820 + local_unlock_cpu_slab(s, flags); 4539 3821 deactivate_slab(s, slab, freelist); 4540 3822 4541 3823 new_slab: 4542 3824 4543 3825 #ifdef CONFIG_SLUB_CPU_PARTIAL 4544 3826 while (slub_percpu_partial(c)) { 4545 - local_lock_irqsave(&s->cpu_slab->lock, flags); 3827 + local_lock_cpu_slab(s, flags); 4546 3828 if (unlikely(c->slab)) { 4547 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3829 + local_unlock_cpu_slab(s, flags); 4548 3830 goto reread_slab; 4549 3831 } 4550 3832 if (unlikely(!slub_percpu_partial(c))) { 4551 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3833 + local_unlock_cpu_slab(s, flags); 4552 3834 /* we were preempted and partial list got empty */ 4553 3835 goto new_objects; 4554 3836 } ··· 4557 3839 slub_set_percpu_partial(c, slab); 4558 3840 4559 3841 if (likely(node_match(slab, node) && 4560 - pfmemalloc_match(slab, gfpflags))) { 3842 + pfmemalloc_match(slab, gfpflags)) || 3843 + !allow_spin) { 4561 3844 c->slab = slab; 4562 3845 freelist = get_freelist(s, slab); 4563 3846 VM_BUG_ON(!freelist); ··· 4566 3847 goto load_freelist; 4567 3848 } 4568 3849 4569 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3850 + local_unlock_cpu_slab(s, flags); 4570 3851 4571 3852 slab->next = NULL; 4572 3853 __put_partials(s, slab); ··· 4588 3869 * allocating new page from other nodes 4589 3870 */ 4590 3871 if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4591 - && try_thisnode)) 4592 - pc.flags = GFP_NOWAIT | __GFP_THISNODE; 3872 + && try_thisnode)) { 3873 + if (unlikely(!allow_spin)) 3874 + /* Do not upgrade gfp to NOWAIT from more 
restrictive mode */ 3875 + pc.flags = gfpflags | __GFP_THISNODE; 3876 + else 3877 + pc.flags = GFP_NOWAIT | __GFP_THISNODE; 3878 + } 4593 3879 4594 3880 pc.orig_size = orig_size; 4595 3881 slab = get_partial(s, node, &pc); ··· 4638 3914 stat(s, ALLOC_SLAB); 4639 3915 4640 3916 if (kmem_cache_debug(s)) { 4641 - freelist = alloc_single_from_new_slab(s, slab, orig_size); 3917 + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4642 3918 4643 3919 if (unlikely(!freelist)) 4644 3920 goto new_objects; ··· 4661 3937 4662 3938 inc_slabs_node(s, slab_nid(slab), slab->objects); 4663 3939 4664 - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { 3940 + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { 4665 3941 /* 4666 3942 * For !pfmemalloc_match() case we don't load freelist so that 4667 3943 * we don't make further mismatched allocations easier. ··· 4672 3948 4673 3949 retry_load_slab: 4674 3950 4675 - local_lock_irqsave(&s->cpu_slab->lock, flags); 3951 + local_lock_cpu_slab(s, flags); 4676 3952 if (unlikely(c->slab)) { 4677 3953 void *flush_freelist = c->freelist; 4678 3954 struct slab *flush_slab = c->slab; ··· 4681 3957 c->freelist = NULL; 4682 3958 c->tid = next_tid(c->tid); 4683 3959 4684 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3960 + local_unlock_cpu_slab(s, flags); 4685 3961 4686 - deactivate_slab(s, flush_slab, flush_freelist); 3962 + if (unlikely(!allow_spin)) { 3963 + /* Reentrant slub cannot take locks, defer */ 3964 + defer_deactivate_slab(flush_slab, flush_freelist); 3965 + } else { 3966 + deactivate_slab(s, flush_slab, flush_freelist); 3967 + } 4687 3968 4688 3969 stat(s, CPUSLAB_FLUSH); 4689 3970 ··· 4698 3969 4699 3970 goto load_freelist; 4700 3971 } 3972 + /* 3973 + * We disallow kprobes in ___slab_alloc() to prevent reentrance 3974 + * 3975 + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of 3976 + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> 3977 + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() 3978 + * manipulating c->freelist without lock. 3979 + * 3980 + * This does not prevent kprobe in functions called from ___slab_alloc() such as 3981 + * local_lock_irqsave() itself, and that is fine, we only need to protect the 3982 + * c->freelist manipulation in ___slab_alloc() itself. 3983 + */ 3984 + NOKPROBE_SYMBOL(___slab_alloc); 4701 3985 4702 3986 /* 4703 3987 * A wrapper for ___slab_alloc() for contexts where preemption is not yet ··· 4730 3988 */ 4731 3989 c = slub_get_cpu_ptr(s->cpu_slab); 4732 3990 #endif 4733 - 3991 + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { 3992 + if (local_lock_is_locked(&s->cpu_slab->lock)) { 3993 + /* 3994 + * EBUSY is an internal signal to kmalloc_nolock() to 3995 + * retry a different bucket. It's not propagated 3996 + * to the caller. 
3997 + */ 3998 + p = ERR_PTR(-EBUSY); 3999 + goto out; 4000 + } 4001 + } 4734 4002 p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); 4003 + out: 4735 4004 #ifdef CONFIG_PREEMPT_COUNT 4736 4005 slub_put_cpu_ptr(s->cpu_slab); 4737 4006 #endif ··· 4866 4113 return NULL; 4867 4114 } 4868 4115 4869 - object = alloc_single_from_new_slab(s, slab, orig_size); 4116 + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4870 4117 4871 4118 return object; 4872 4119 } ··· 4945 4192 if (p[i] && init && (!kasan_init || 4946 4193 !kasan_has_integrated_init())) 4947 4194 memset(p[i], 0, zero_size); 4948 - kmemleak_alloc_recursive(p[i], s->object_size, 1, 4949 - s->flags, init_flags); 4195 + if (gfpflags_allow_spinning(flags)) 4196 + kmemleak_alloc_recursive(p[i], s->object_size, 1, 4197 + s->flags, init_flags); 4950 4198 kmsan_slab_alloc(s, p[i], init_flags); 4951 4199 alloc_tagging_slab_alloc_hook(s, p[i], flags); 4952 4200 } 4953 4201 4954 4202 return memcg_slab_post_alloc_hook(s, lru, flags, size, p); 4955 4203 } 4204 + 4205 + /* 4206 + * Replace the empty main sheaf with a (at least partially) full sheaf. 4207 + * 4208 + * Must be called with the cpu_sheaves local lock locked. If successful, returns 4209 + * the pcs pointer and the local lock locked (possibly on a different cpu than 4210 + * initially called). If not successful, returns NULL and the local lock 4211 + * unlocked. 4212 + */ 4213 + static struct slub_percpu_sheaves * 4214 + __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) 4215 + { 4216 + struct slab_sheaf *empty = NULL; 4217 + struct slab_sheaf *full; 4218 + struct node_barn *barn; 4219 + bool can_alloc; 4220 + 4221 + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 4222 + 4223 + if (pcs->spare && pcs->spare->size > 0) { 4224 + swap(pcs->main, pcs->spare); 4225 + return pcs; 4226 + } 4227 + 4228 + barn = get_barn(s); 4229 + 4230 + full = barn_replace_empty_sheaf(barn, pcs->main); 4231 + 4232 + if (full) { 4233 + stat(s, BARN_GET); 4234 + pcs->main = full; 4235 + return pcs; 4236 + } 4237 + 4238 + stat(s, BARN_GET_FAIL); 4239 + 4240 + can_alloc = gfpflags_allow_blocking(gfp); 4241 + 4242 + if (can_alloc) { 4243 + if (pcs->spare) { 4244 + empty = pcs->spare; 4245 + pcs->spare = NULL; 4246 + } else { 4247 + empty = barn_get_empty_sheaf(barn); 4248 + } 4249 + } 4250 + 4251 + local_unlock(&s->cpu_sheaves->lock); 4252 + 4253 + if (!can_alloc) 4254 + return NULL; 4255 + 4256 + if (empty) { 4257 + if (!refill_sheaf(s, empty, gfp)) { 4258 + full = empty; 4259 + } else { 4260 + /* 4261 + * we must be very low on memory so don't bother 4262 + * with the barn 4263 + */ 4264 + free_empty_sheaf(s, empty); 4265 + } 4266 + } else { 4267 + full = alloc_full_sheaf(s, gfp); 4268 + } 4269 + 4270 + if (!full) 4271 + return NULL; 4272 + 4273 + /* 4274 + * we can reach here only when gfpflags_allow_blocking 4275 + * so this must not be an irq 4276 + */ 4277 + local_lock(&s->cpu_sheaves->lock); 4278 + pcs = this_cpu_ptr(s->cpu_sheaves); 4279 + 4280 + /* 4281 + * If we are returning empty sheaf, we either got it from the 4282 + * barn or had to allocate one. If we are returning a full 4283 + * sheaf, it's due to racing or being migrated to a different 4284 + * cpu. Breaching the barn's sheaf limits should be thus rare 4285 + * enough so just ignore them to simplify the recovery. 
4286 + */ 4287 + 4288 + if (pcs->main->size == 0) { 4289 + barn_put_empty_sheaf(barn, pcs->main); 4290 + pcs->main = full; 4291 + return pcs; 4292 + } 4293 + 4294 + if (!pcs->spare) { 4295 + pcs->spare = full; 4296 + return pcs; 4297 + } 4298 + 4299 + if (pcs->spare->size == 0) { 4300 + barn_put_empty_sheaf(barn, pcs->spare); 4301 + pcs->spare = full; 4302 + return pcs; 4303 + } 4304 + 4305 + barn_put_full_sheaf(barn, full); 4306 + stat(s, BARN_PUT); 4307 + 4308 + return pcs; 4309 + } 4310 + 4311 + static __fastpath_inline 4312 + void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) 4313 + { 4314 + struct slub_percpu_sheaves *pcs; 4315 + bool node_requested; 4316 + void *object; 4317 + 4318 + #ifdef CONFIG_NUMA 4319 + if (static_branch_unlikely(&strict_numa) && 4320 + node == NUMA_NO_NODE) { 4321 + 4322 + struct mempolicy *mpol = current->mempolicy; 4323 + 4324 + if (mpol) { 4325 + /* 4326 + * Special BIND rule support. If the local node 4327 + * is in permitted set then do not redirect 4328 + * to a particular node. 4329 + * Otherwise we apply the memory policy to get 4330 + * the node we need to allocate on. 4331 + */ 4332 + if (mpol->mode != MPOL_BIND || 4333 + !node_isset(numa_mem_id(), mpol->nodes)) 4334 + 4335 + node = mempolicy_slab_node(); 4336 + } 4337 + } 4338 + #endif 4339 + 4340 + node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE; 4341 + 4342 + /* 4343 + * We assume the percpu sheaves contain only local objects although it's 4344 + * not completely guaranteed, so we verify later. 4345 + */ 4346 + if (unlikely(node_requested && node != numa_mem_id())) 4347 + return NULL; 4348 + 4349 + if (!local_trylock(&s->cpu_sheaves->lock)) 4350 + return NULL; 4351 + 4352 + pcs = this_cpu_ptr(s->cpu_sheaves); 4353 + 4354 + if (unlikely(pcs->main->size == 0)) { 4355 + pcs = __pcs_replace_empty_main(s, pcs, gfp); 4356 + if (unlikely(!pcs)) 4357 + return NULL; 4358 + } 4359 + 4360 + object = pcs->main->objects[pcs->main->size - 1]; 4361 + 4362 + if (unlikely(node_requested)) { 4363 + /* 4364 + * Verify that the object was from the node we want. This could 4365 + * be false because of cpu migration during an unlocked part of 4366 + * the current allocation or previous freeing process. 
4367 + */ 4368 + if (folio_nid(virt_to_folio(object)) != node) { 4369 + local_unlock(&s->cpu_sheaves->lock); 4370 + return NULL; 4371 + } 4372 + } 4373 + 4374 + pcs->main->size--; 4375 + 4376 + local_unlock(&s->cpu_sheaves->lock); 4377 + 4378 + stat(s, ALLOC_PCS); 4379 + 4380 + return object; 4381 + } 4382 + 4383 + static __fastpath_inline 4384 + unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) 4385 + { 4386 + struct slub_percpu_sheaves *pcs; 4387 + struct slab_sheaf *main; 4388 + unsigned int allocated = 0; 4389 + unsigned int batch; 4390 + 4391 + next_batch: 4392 + if (!local_trylock(&s->cpu_sheaves->lock)) 4393 + return allocated; 4394 + 4395 + pcs = this_cpu_ptr(s->cpu_sheaves); 4396 + 4397 + if (unlikely(pcs->main->size == 0)) { 4398 + 4399 + struct slab_sheaf *full; 4400 + 4401 + if (pcs->spare && pcs->spare->size > 0) { 4402 + swap(pcs->main, pcs->spare); 4403 + goto do_alloc; 4404 + } 4405 + 4406 + full = barn_replace_empty_sheaf(get_barn(s), pcs->main); 4407 + 4408 + if (full) { 4409 + stat(s, BARN_GET); 4410 + pcs->main = full; 4411 + goto do_alloc; 4412 + } 4413 + 4414 + stat(s, BARN_GET_FAIL); 4415 + 4416 + local_unlock(&s->cpu_sheaves->lock); 4417 + 4418 + /* 4419 + * Once full sheaves in barn are depleted, let the bulk 4420 + * allocation continue from slab pages, otherwise we would just 4421 + * be copying arrays of pointers twice. 4422 + */ 4423 + return allocated; 4424 + } 4425 + 4426 + do_alloc: 4427 + 4428 + main = pcs->main; 4429 + batch = min(size, main->size); 4430 + 4431 + main->size -= batch; 4432 + memcpy(p, main->objects + main->size, batch * sizeof(void *)); 4433 + 4434 + local_unlock(&s->cpu_sheaves->lock); 4435 + 4436 + stat_add(s, ALLOC_PCS, batch); 4437 + 4438 + allocated += batch; 4439 + 4440 + if (batch < size) { 4441 + p += batch; 4442 + size -= batch; 4443 + goto next_batch; 4444 + } 4445 + 4446 + return allocated; 4447 + } 4448 + 4956 4449 4957 4450 /* 4958 4451 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) ··· 5224 4225 if (unlikely(object)) 5225 4226 goto out; 5226 4227 5227 - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); 4228 + if (s->cpu_sheaves) 4229 + object = alloc_from_pcs(s, gfpflags, node); 4230 + 4231 + if (!object) 4232 + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); 5228 4233 5229 4234 maybe_wipe_obj_freeptr(s, object); 5230 4235 init = slab_want_init_on_alloc(gfpflags, s); ··· 5300 4297 } 5301 4298 EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); 5302 4299 4300 + /* 4301 + * returns a sheaf that has at least the requested size 4302 + * when prefilling is needed, do so with given gfp flags 4303 + * 4304 + * return NULL if sheaf allocation or prefilling failed 4305 + */ 4306 + struct slab_sheaf * 4307 + kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) 4308 + { 4309 + struct slub_percpu_sheaves *pcs; 4310 + struct slab_sheaf *sheaf = NULL; 4311 + 4312 + if (unlikely(size > s->sheaf_capacity)) { 4313 + 4314 + /* 4315 + * slab_debug disables cpu sheaves intentionally so all 4316 + * prefilled sheaves become "oversize" and we give up on 4317 + * performance for the debugging. Same with SLUB_TINY. 4318 + * Creating a cache without sheaves and then requesting a 4319 + * prefilled sheaf is however not expected, so warn. 
4320 + */ 4321 + WARN_ON_ONCE(s->sheaf_capacity == 0 && 4322 + !IS_ENABLED(CONFIG_SLUB_TINY) && 4323 + !(s->flags & SLAB_DEBUG_FLAGS)); 4324 + 4325 + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); 4326 + if (!sheaf) 4327 + return NULL; 4328 + 4329 + stat(s, SHEAF_PREFILL_OVERSIZE); 4330 + sheaf->cache = s; 4331 + sheaf->capacity = size; 4332 + 4333 + if (!__kmem_cache_alloc_bulk(s, gfp, size, 4334 + &sheaf->objects[0])) { 4335 + kfree(sheaf); 4336 + return NULL; 4337 + } 4338 + 4339 + sheaf->size = size; 4340 + 4341 + return sheaf; 4342 + } 4343 + 4344 + local_lock(&s->cpu_sheaves->lock); 4345 + pcs = this_cpu_ptr(s->cpu_sheaves); 4346 + 4347 + if (pcs->spare) { 4348 + sheaf = pcs->spare; 4349 + pcs->spare = NULL; 4350 + stat(s, SHEAF_PREFILL_FAST); 4351 + } else { 4352 + stat(s, SHEAF_PREFILL_SLOW); 4353 + sheaf = barn_get_full_or_empty_sheaf(get_barn(s)); 4354 + if (sheaf && sheaf->size) 4355 + stat(s, BARN_GET); 4356 + else 4357 + stat(s, BARN_GET_FAIL); 4358 + } 4359 + 4360 + local_unlock(&s->cpu_sheaves->lock); 4361 + 4362 + 4363 + if (!sheaf) 4364 + sheaf = alloc_empty_sheaf(s, gfp); 4365 + 4366 + if (sheaf && sheaf->size < size) { 4367 + if (refill_sheaf(s, sheaf, gfp)) { 4368 + sheaf_flush_unused(s, sheaf); 4369 + free_empty_sheaf(s, sheaf); 4370 + sheaf = NULL; 4371 + } 4372 + } 4373 + 4374 + if (sheaf) 4375 + sheaf->capacity = s->sheaf_capacity; 4376 + 4377 + return sheaf; 4378 + } 4379 + 4380 + /* 4381 + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() 4382 + * 4383 + * If the sheaf cannot simply become the percpu spare sheaf, but there's space 4384 + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's 4385 + * sheaf_capacity to avoid handling partially full sheaves. 4386 + * 4387 + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the 4388 + * sheaf is instead flushed and freed. 4389 + */ 4390 + void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, 4391 + struct slab_sheaf *sheaf) 4392 + { 4393 + struct slub_percpu_sheaves *pcs; 4394 + struct node_barn *barn; 4395 + 4396 + if (unlikely(sheaf->capacity != s->sheaf_capacity)) { 4397 + sheaf_flush_unused(s, sheaf); 4398 + kfree(sheaf); 4399 + return; 4400 + } 4401 + 4402 + local_lock(&s->cpu_sheaves->lock); 4403 + pcs = this_cpu_ptr(s->cpu_sheaves); 4404 + barn = get_barn(s); 4405 + 4406 + if (!pcs->spare) { 4407 + pcs->spare = sheaf; 4408 + sheaf = NULL; 4409 + stat(s, SHEAF_RETURN_FAST); 4410 + } 4411 + 4412 + local_unlock(&s->cpu_sheaves->lock); 4413 + 4414 + if (!sheaf) 4415 + return; 4416 + 4417 + stat(s, SHEAF_RETURN_SLOW); 4418 + 4419 + /* 4420 + * If the barn has too many full sheaves or we fail to refill the sheaf, 4421 + * simply flush and free it. 4422 + */ 4423 + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES || 4424 + refill_sheaf(s, sheaf, gfp)) { 4425 + sheaf_flush_unused(s, sheaf); 4426 + free_empty_sheaf(s, sheaf); 4427 + return; 4428 + } 4429 + 4430 + barn_put_full_sheaf(barn, sheaf); 4431 + stat(s, BARN_PUT); 4432 + } 4433 + 4434 + /* 4435 + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least 4436 + * the given size 4437 + * 4438 + * the sheaf might be replaced by a new one when requesting more than 4439 + * s->sheaf_capacity objects if such replacement is necessary, but the refill 4440 + * fails (returning -ENOMEM), the existing sheaf is left intact 4441 + * 4442 + * In practice we always refill to full sheaf's capacity. 
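 *
 * A minimal usage sketch (hypothetical caller, not part of this patch),
 * assuming "cache" was created with a nonzero sheaf_capacity and that
 * kmem_cache_alloc_from_sheaf() is the wrapper around the _noprof variant
 * below; if the refill fails, the originally prefilled sheaf stays valid:
 *
 *	sheaf = kmem_cache_prefill_sheaf(cache, GFP_KERNEL, 8);
 *	if (!sheaf)
 *		return -ENOMEM;
 *
 *	err = kmem_cache_refill_sheaf(cache, GFP_KERNEL, &sheaf, 16);
 *
 *	obj = kmem_cache_alloc_from_sheaf(cache, GFP_KERNEL, sheaf);
 *	...
 *	kmem_cache_return_sheaf(cache, GFP_KERNEL, sheaf);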
4443 + */ 4444 + int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, 4445 + struct slab_sheaf **sheafp, unsigned int size) 4446 + { 4447 + struct slab_sheaf *sheaf; 4448 + 4449 + /* 4450 + * TODO: do we want to support *sheaf == NULL to be equivalent of 4451 + * kmem_cache_prefill_sheaf() ? 4452 + */ 4453 + if (!sheafp || !(*sheafp)) 4454 + return -EINVAL; 4455 + 4456 + sheaf = *sheafp; 4457 + if (sheaf->size >= size) 4458 + return 0; 4459 + 4460 + if (likely(sheaf->capacity >= size)) { 4461 + if (likely(sheaf->capacity == s->sheaf_capacity)) 4462 + return refill_sheaf(s, sheaf, gfp); 4463 + 4464 + if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, 4465 + &sheaf->objects[sheaf->size])) { 4466 + return -ENOMEM; 4467 + } 4468 + sheaf->size = sheaf->capacity; 4469 + 4470 + return 0; 4471 + } 4472 + 4473 + /* 4474 + * We had a regular sized sheaf and need an oversize one, or we had an 4475 + * oversize one already but need a larger one now. 4476 + * This should be a very rare path so let's not complicate it. 4477 + */ 4478 + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); 4479 + if (!sheaf) 4480 + return -ENOMEM; 4481 + 4482 + kmem_cache_return_sheaf(s, gfp, *sheafp); 4483 + *sheafp = sheaf; 4484 + return 0; 4485 + } 4486 + 4487 + /* 4488 + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf() 4489 + * 4490 + * Guaranteed not to fail as many allocations as was the requested size. 4491 + * After the sheaf is emptied, it fails - no fallback to the slab cache itself. 4492 + * 4493 + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT 4494 + * memcg charging is forced over limit if necessary, to avoid failure. 4495 + */ 4496 + void * 4497 + kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, 4498 + struct slab_sheaf *sheaf) 4499 + { 4500 + void *ret = NULL; 4501 + bool init; 4502 + 4503 + if (sheaf->size == 0) 4504 + goto out; 4505 + 4506 + ret = sheaf->objects[--sheaf->size]; 4507 + 4508 + init = slab_want_init_on_alloc(gfp, s); 4509 + 4510 + /* add __GFP_NOFAIL to force successful memcg charging */ 4511 + slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); 4512 + out: 4513 + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); 4514 + 4515 + return ret; 4516 + } 4517 + 4518 + unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) 4519 + { 4520 + return sheaf->size; 4521 + } 5303 4522 /* 5304 4523 * To avoid unnecessary overhead, we pass through large allocation requests 5305 4524 * directly to the page allocator. We use __GFP_COMP, because we will need to ··· 5613 4388 return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_); 5614 4389 } 5615 4390 EXPORT_SYMBOL(__kmalloc_noprof); 4391 + 4392 + /** 4393 + * kmalloc_nolock - Allocate an object of given size from any context. 4394 + * @size: size to allocate 4395 + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT 4396 + * allowed. 4397 + * @node: node number of the target node. 4398 + * 4399 + * Return: pointer to the new object or NULL in case of error. 4400 + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. 4401 + * There is no reason to call it again and expect !NULL. 
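 *
 * A minimal hypothetical caller, e.g. from NMI or tracing context where
 * the regular kmalloc() may not be safe to call:
 *
 *	void *obj = kmalloc_nolock(64, __GFP_ZERO, NUMA_NO_NODE);
 *
 *	if (obj) {
 *		...
 *		kfree_nolock(obj);
 *	}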
4402 + */ 4403 + void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) 4404 + { 4405 + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; 4406 + struct kmem_cache *s; 4407 + bool can_retry = true; 4408 + void *ret = ERR_PTR(-EBUSY); 4409 + 4410 + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | 4411 + __GFP_NO_OBJ_EXT)); 4412 + 4413 + if (unlikely(!size)) 4414 + return ZERO_SIZE_PTR; 4415 + 4416 + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) 4417 + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ 4418 + return NULL; 4419 + retry: 4420 + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 4421 + return NULL; 4422 + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); 4423 + 4424 + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) 4425 + /* 4426 + * kmalloc_nolock() is not supported on architectures that 4427 + * don't implement cmpxchg16b, but debug caches don't use 4428 + * per-cpu slab and per-cpu partial slabs. They rely on 4429 + * kmem_cache_node->list_lock, so kmalloc_nolock() can 4430 + * attempt to allocate from debug caches by 4431 + * spin_trylock_irqsave(&n->list_lock, ...) 4432 + */ 4433 + return NULL; 4434 + 4435 + /* 4436 + * Do not call slab_alloc_node(), since trylock mode isn't 4437 + * compatible with slab_pre_alloc_hook/should_failslab and 4438 + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) 4439 + * and slab_post_alloc_hook() directly. 4440 + * 4441 + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair 4442 + * in irq saved region. It assumes that the same cpu will not 4443 + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. 4444 + * Therefore use in_nmi() to check whether particular bucket is in 4445 + * irq protected section. 4446 + * 4447 + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that 4448 + * this cpu was interrupted somewhere inside ___slab_alloc() after 4449 + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 4450 + * In this case fast path with __update_cpu_freelist_fast() is not safe. 4451 + */ 4452 + #ifndef CONFIG_SLUB_TINY 4453 + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) 4454 + #endif 4455 + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 4456 + 4457 + if (PTR_ERR(ret) == -EBUSY) { 4458 + if (can_retry) { 4459 + /* pick the next kmalloc bucket */ 4460 + size = s->object_size + 1; 4461 + /* 4462 + * Another alternative is to 4463 + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 4464 + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 4465 + * to retry from bucket of the same size. 4466 + */ 4467 + can_retry = false; 4468 + goto retry; 4469 + } 4470 + ret = NULL; 4471 + } 4472 + 4473 + maybe_wipe_obj_freeptr(s, ret); 4474 + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, 4475 + slab_want_init_on_alloc(alloc_gfp, s), size); 4476 + 4477 + ret = kasan_kmalloc(s, ret, size, alloc_gfp); 4478 + return ret; 4479 + } 4480 + EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); 5616 4481 5617 4482 void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, 5618 4483 int node, unsigned long caller) ··· 5921 4606 discard_slab(s, slab); 5922 4607 } 5923 4608 4609 + /* 4610 + * pcs is locked. We should have get rid of the spare sheaf and obtained an 4611 + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf 4612 + * as a main sheaf, and make the current main sheaf a spare sheaf. 
4613 + * 4614 + * However due to having relinquished the cpu_sheaves lock when obtaining 4615 + * the empty sheaf, we need to handle some unlikely but possible cases. 4616 + * 4617 + * If we put any sheaf to barn here, it's because we were interrupted or have 4618 + * been migrated to a different cpu, which should be rare enough so just ignore 4619 + * the barn's limits to simplify the handling. 4620 + * 4621 + * An alternative scenario that gets us here is when we fail 4622 + * barn_replace_full_sheaf(), because there's no empty sheaf available in the 4623 + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the 4624 + * limit on full sheaves was not exceeded, we assume it didn't change and just 4625 + * put the full sheaf there. 4626 + */ 4627 + static void __pcs_install_empty_sheaf(struct kmem_cache *s, 4628 + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty) 4629 + { 4630 + struct node_barn *barn; 4631 + 4632 + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 4633 + 4634 + /* This is what we expect to find if nobody interrupted us. */ 4635 + if (likely(!pcs->spare)) { 4636 + pcs->spare = pcs->main; 4637 + pcs->main = empty; 4638 + return; 4639 + } 4640 + 4641 + barn = get_barn(s); 4642 + 4643 + /* 4644 + * Unlikely because if the main sheaf had space, we would have just 4645 + * freed to it. Get rid of our empty sheaf. 4646 + */ 4647 + if (pcs->main->size < s->sheaf_capacity) { 4648 + barn_put_empty_sheaf(barn, empty); 4649 + return; 4650 + } 4651 + 4652 + /* Also unlikely for the same reason */ 4653 + if (pcs->spare->size < s->sheaf_capacity) { 4654 + swap(pcs->main, pcs->spare); 4655 + barn_put_empty_sheaf(barn, empty); 4656 + return; 4657 + } 4658 + 4659 + /* 4660 + * We probably failed barn_replace_full_sheaf() due to no empty sheaf 4661 + * available there, but we allocated one, so finish the job. 4662 + */ 4663 + barn_put_full_sheaf(barn, pcs->main); 4664 + stat(s, BARN_PUT); 4665 + pcs->main = empty; 4666 + } 4667 + 4668 + /* 4669 + * Replace the full main sheaf with a (at least partially) empty sheaf. 4670 + * 4671 + * Must be called with the cpu_sheaves local lock locked. If successful, returns 4672 + * the pcs pointer and the local lock locked (possibly on a different cpu than 4673 + * initially called). If not successful, returns NULL and the local lock 4674 + * unlocked. 
4675 + */ 4676 + static struct slub_percpu_sheaves * 4677 + __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) 4678 + { 4679 + struct slab_sheaf *empty; 4680 + struct node_barn *barn; 4681 + bool put_fail; 4682 + 4683 + restart: 4684 + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 4685 + 4686 + barn = get_barn(s); 4687 + put_fail = false; 4688 + 4689 + if (!pcs->spare) { 4690 + empty = barn_get_empty_sheaf(barn); 4691 + if (empty) { 4692 + pcs->spare = pcs->main; 4693 + pcs->main = empty; 4694 + return pcs; 4695 + } 4696 + goto alloc_empty; 4697 + } 4698 + 4699 + if (pcs->spare->size < s->sheaf_capacity) { 4700 + swap(pcs->main, pcs->spare); 4701 + return pcs; 4702 + } 4703 + 4704 + empty = barn_replace_full_sheaf(barn, pcs->main); 4705 + 4706 + if (!IS_ERR(empty)) { 4707 + stat(s, BARN_PUT); 4708 + pcs->main = empty; 4709 + return pcs; 4710 + } 4711 + 4712 + if (PTR_ERR(empty) == -E2BIG) { 4713 + /* Since we got here, spare exists and is full */ 4714 + struct slab_sheaf *to_flush = pcs->spare; 4715 + 4716 + stat(s, BARN_PUT_FAIL); 4717 + 4718 + pcs->spare = NULL; 4719 + local_unlock(&s->cpu_sheaves->lock); 4720 + 4721 + sheaf_flush_unused(s, to_flush); 4722 + empty = to_flush; 4723 + goto got_empty; 4724 + } 4725 + 4726 + /* 4727 + * We could not replace full sheaf because barn had no empty 4728 + * sheaves. We can still allocate it and put the full sheaf in 4729 + * __pcs_install_empty_sheaf(), but if we fail to allocate it, 4730 + * make sure to count the fail. 4731 + */ 4732 + put_fail = true; 4733 + 4734 + alloc_empty: 4735 + local_unlock(&s->cpu_sheaves->lock); 4736 + 4737 + empty = alloc_empty_sheaf(s, GFP_NOWAIT); 4738 + if (empty) 4739 + goto got_empty; 4740 + 4741 + if (put_fail) 4742 + stat(s, BARN_PUT_FAIL); 4743 + 4744 + if (!sheaf_flush_main(s)) 4745 + return NULL; 4746 + 4747 + if (!local_trylock(&s->cpu_sheaves->lock)) 4748 + return NULL; 4749 + 4750 + pcs = this_cpu_ptr(s->cpu_sheaves); 4751 + 4752 + /* 4753 + * we flushed the main sheaf so it should be empty now, 4754 + * but in case we got preempted or migrated, we need to 4755 + * check again 4756 + */ 4757 + if (pcs->main->size == s->sheaf_capacity) 4758 + goto restart; 4759 + 4760 + return pcs; 4761 + 4762 + got_empty: 4763 + if (!local_trylock(&s->cpu_sheaves->lock)) { 4764 + barn_put_empty_sheaf(barn, empty); 4765 + return NULL; 4766 + } 4767 + 4768 + pcs = this_cpu_ptr(s->cpu_sheaves); 4769 + __pcs_install_empty_sheaf(s, pcs, empty); 4770 + 4771 + return pcs; 4772 + } 4773 + 4774 + /* 4775 + * Free an object to the percpu sheaves. 4776 + * The object is expected to have passed slab_free_hook() already. 
4777 + */ 4778 + static __fastpath_inline 4779 + bool free_to_pcs(struct kmem_cache *s, void *object) 4780 + { 4781 + struct slub_percpu_sheaves *pcs; 4782 + 4783 + if (!local_trylock(&s->cpu_sheaves->lock)) 4784 + return false; 4785 + 4786 + pcs = this_cpu_ptr(s->cpu_sheaves); 4787 + 4788 + if (unlikely(pcs->main->size == s->sheaf_capacity)) { 4789 + 4790 + pcs = __pcs_replace_full_main(s, pcs); 4791 + if (unlikely(!pcs)) 4792 + return false; 4793 + } 4794 + 4795 + pcs->main->objects[pcs->main->size++] = object; 4796 + 4797 + local_unlock(&s->cpu_sheaves->lock); 4798 + 4799 + stat(s, FREE_PCS); 4800 + 4801 + return true; 4802 + } 4803 + 4804 + static void rcu_free_sheaf(struct rcu_head *head) 4805 + { 4806 + struct slab_sheaf *sheaf; 4807 + struct node_barn *barn; 4808 + struct kmem_cache *s; 4809 + 4810 + sheaf = container_of(head, struct slab_sheaf, rcu_head); 4811 + 4812 + s = sheaf->cache; 4813 + 4814 + /* 4815 + * This may remove some objects due to slab_free_hook() returning false, 4816 + * so that the sheaf might no longer be completely full. But it's easier 4817 + * to handle it as full (unless it became completely empty), as the code 4818 + * handles it fine. The only downside is that sheaf will serve fewer 4819 + * allocations when reused. It only happens due to debugging, which is a 4820 + * performance hit anyway. 4821 + */ 4822 + __rcu_free_sheaf_prepare(s, sheaf); 4823 + 4824 + barn = get_node(s, sheaf->node)->barn; 4825 + 4826 + /* due to slab_free_hook() */ 4827 + if (unlikely(sheaf->size == 0)) 4828 + goto empty; 4829 + 4830 + /* 4831 + * Checking nr_full/nr_empty outside lock avoids contention in case the 4832 + * barn is at the respective limit. Due to the race we might go over the 4833 + * limit but that should be rare and harmless. 
4834 + */ 4835 + 4836 + if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) { 4837 + stat(s, BARN_PUT); 4838 + barn_put_full_sheaf(barn, sheaf); 4839 + return; 4840 + } 4841 + 4842 + stat(s, BARN_PUT_FAIL); 4843 + sheaf_flush_unused(s, sheaf); 4844 + 4845 + empty: 4846 + if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { 4847 + barn_put_empty_sheaf(barn, sheaf); 4848 + return; 4849 + } 4850 + 4851 + free_empty_sheaf(s, sheaf); 4852 + } 4853 + 4854 + bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) 4855 + { 4856 + struct slub_percpu_sheaves *pcs; 4857 + struct slab_sheaf *rcu_sheaf; 4858 + 4859 + if (!local_trylock(&s->cpu_sheaves->lock)) 4860 + goto fail; 4861 + 4862 + pcs = this_cpu_ptr(s->cpu_sheaves); 4863 + 4864 + if (unlikely(!pcs->rcu_free)) { 4865 + 4866 + struct slab_sheaf *empty; 4867 + struct node_barn *barn; 4868 + 4869 + if (pcs->spare && pcs->spare->size == 0) { 4870 + pcs->rcu_free = pcs->spare; 4871 + pcs->spare = NULL; 4872 + goto do_free; 4873 + } 4874 + 4875 + barn = get_barn(s); 4876 + 4877 + empty = barn_get_empty_sheaf(barn); 4878 + 4879 + if (empty) { 4880 + pcs->rcu_free = empty; 4881 + goto do_free; 4882 + } 4883 + 4884 + local_unlock(&s->cpu_sheaves->lock); 4885 + 4886 + empty = alloc_empty_sheaf(s, GFP_NOWAIT); 4887 + 4888 + if (!empty) 4889 + goto fail; 4890 + 4891 + if (!local_trylock(&s->cpu_sheaves->lock)) { 4892 + barn_put_empty_sheaf(barn, empty); 4893 + goto fail; 4894 + } 4895 + 4896 + pcs = this_cpu_ptr(s->cpu_sheaves); 4897 + 4898 + if (unlikely(pcs->rcu_free)) 4899 + barn_put_empty_sheaf(barn, empty); 4900 + else 4901 + pcs->rcu_free = empty; 4902 + } 4903 + 4904 + do_free: 4905 + 4906 + rcu_sheaf = pcs->rcu_free; 4907 + 4908 + /* 4909 + * Since we flush immediately when size reaches capacity, we never reach 4910 + * this with size already at capacity, so no OOB write is possible. 4911 + */ 4912 + rcu_sheaf->objects[rcu_sheaf->size++] = obj; 4913 + 4914 + if (likely(rcu_sheaf->size < s->sheaf_capacity)) { 4915 + rcu_sheaf = NULL; 4916 + } else { 4917 + pcs->rcu_free = NULL; 4918 + rcu_sheaf->node = numa_mem_id(); 4919 + } 4920 + 4921 + /* 4922 + * we flush before local_unlock to make sure a racing 4923 + * flush_all_rcu_sheaves() doesn't miss this sheaf 4924 + */ 4925 + if (rcu_sheaf) 4926 + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); 4927 + 4928 + local_unlock(&s->cpu_sheaves->lock); 4929 + 4930 + stat(s, FREE_RCU_SHEAF); 4931 + return true; 4932 + 4933 + fail: 4934 + stat(s, FREE_RCU_SHEAF_FAIL); 4935 + return false; 4936 + } 4937 + 4938 + /* 4939 + * Bulk free objects to the percpu sheaves. 4940 + * Unlike free_to_pcs() this includes the calls to all necessary hooks 4941 + * and the fallback to freeing to slab pages. 
4942 + */ 4943 + static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) 4944 + { 4945 + struct slub_percpu_sheaves *pcs; 4946 + struct slab_sheaf *main, *empty; 4947 + bool init = slab_want_init_on_free(s); 4948 + unsigned int batch, i = 0; 4949 + struct node_barn *barn; 4950 + void *remote_objects[PCS_BATCH_MAX]; 4951 + unsigned int remote_nr = 0; 4952 + int node = numa_mem_id(); 4953 + 4954 + next_remote_batch: 4955 + while (i < size) { 4956 + struct slab *slab = virt_to_slab(p[i]); 4957 + 4958 + memcg_slab_free_hook(s, slab, p + i, 1); 4959 + alloc_tagging_slab_free_hook(s, slab, p + i, 1); 4960 + 4961 + if (unlikely(!slab_free_hook(s, p[i], init, false))) { 4962 + p[i] = p[--size]; 4963 + if (!size) 4964 + goto flush_remote; 4965 + continue; 4966 + } 4967 + 4968 + if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { 4969 + remote_objects[remote_nr] = p[i]; 4970 + p[i] = p[--size]; 4971 + if (++remote_nr >= PCS_BATCH_MAX) 4972 + goto flush_remote; 4973 + continue; 4974 + } 4975 + 4976 + i++; 4977 + } 4978 + 4979 + next_batch: 4980 + if (!local_trylock(&s->cpu_sheaves->lock)) 4981 + goto fallback; 4982 + 4983 + pcs = this_cpu_ptr(s->cpu_sheaves); 4984 + 4985 + if (likely(pcs->main->size < s->sheaf_capacity)) 4986 + goto do_free; 4987 + 4988 + barn = get_barn(s); 4989 + 4990 + if (!pcs->spare) { 4991 + empty = barn_get_empty_sheaf(barn); 4992 + if (!empty) 4993 + goto no_empty; 4994 + 4995 + pcs->spare = pcs->main; 4996 + pcs->main = empty; 4997 + goto do_free; 4998 + } 4999 + 5000 + if (pcs->spare->size < s->sheaf_capacity) { 5001 + swap(pcs->main, pcs->spare); 5002 + goto do_free; 5003 + } 5004 + 5005 + empty = barn_replace_full_sheaf(barn, pcs->main); 5006 + if (IS_ERR(empty)) { 5007 + stat(s, BARN_PUT_FAIL); 5008 + goto no_empty; 5009 + } 5010 + 5011 + stat(s, BARN_PUT); 5012 + pcs->main = empty; 5013 + 5014 + do_free: 5015 + main = pcs->main; 5016 + batch = min(size, s->sheaf_capacity - main->size); 5017 + 5018 + memcpy(main->objects + main->size, p, batch * sizeof(void *)); 5019 + main->size += batch; 5020 + 5021 + local_unlock(&s->cpu_sheaves->lock); 5022 + 5023 + stat_add(s, FREE_PCS, batch); 5024 + 5025 + if (batch < size) { 5026 + p += batch; 5027 + size -= batch; 5028 + goto next_batch; 5029 + } 5030 + 5031 + return; 5032 + 5033 + no_empty: 5034 + local_unlock(&s->cpu_sheaves->lock); 5035 + 5036 + /* 5037 + * if we depleted all empty sheaves in the barn or there are too 5038 + * many full sheaves, free the rest to slab pages 5039 + */ 5040 + fallback: 5041 + __kmem_cache_free_bulk(s, size, p); 5042 + 5043 + flush_remote: 5044 + if (remote_nr) { 5045 + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); 5046 + if (i < size) { 5047 + remote_nr = 0; 5048 + goto next_remote_batch; 5049 + } 5050 + } 5051 + } 5052 + 5053 + struct defer_free { 5054 + struct llist_head objects; 5055 + struct llist_head slabs; 5056 + struct irq_work work; 5057 + }; 5058 + 5059 + static void free_deferred_objects(struct irq_work *work); 5060 + 5061 + static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { 5062 + .objects = LLIST_HEAD_INIT(objects), 5063 + .slabs = LLIST_HEAD_INIT(slabs), 5064 + .work = IRQ_WORK_INIT(free_deferred_objects), 5065 + }; 5066 + 5067 + /* 5068 + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe 5069 + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). 5070 + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
5071 + */ 5072 + static void free_deferred_objects(struct irq_work *work) 5073 + { 5074 + struct defer_free *df = container_of(work, struct defer_free, work); 5075 + struct llist_head *objs = &df->objects; 5076 + struct llist_head *slabs = &df->slabs; 5077 + struct llist_node *llnode, *pos, *t; 5078 + 5079 + if (llist_empty(objs) && llist_empty(slabs)) 5080 + return; 5081 + 5082 + llnode = llist_del_all(objs); 5083 + llist_for_each_safe(pos, t, llnode) { 5084 + struct kmem_cache *s; 5085 + struct slab *slab; 5086 + void *x = pos; 5087 + 5088 + slab = virt_to_slab(x); 5089 + s = slab->slab_cache; 5090 + 5091 + /* 5092 + * We used freepointer in 'x' to link 'x' into df->objects. 5093 + * Clear it to NULL to avoid false positive detection 5094 + * of "Freepointer corruption". 5095 + */ 5096 + *(void **)x = NULL; 5097 + 5098 + /* Point 'x' back to the beginning of allocated object */ 5099 + x -= s->offset; 5100 + __slab_free(s, slab, x, x, 1, _THIS_IP_); 5101 + } 5102 + 5103 + llnode = llist_del_all(slabs); 5104 + llist_for_each_safe(pos, t, llnode) { 5105 + struct slab *slab = container_of(pos, struct slab, llnode); 5106 + 5107 + #ifdef CONFIG_SLUB_TINY 5108 + discard_slab(slab->slab_cache, slab); 5109 + #else 5110 + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); 5111 + #endif 5112 + } 5113 + } 5114 + 5115 + static void defer_free(struct kmem_cache *s, void *head) 5116 + { 5117 + struct defer_free *df = this_cpu_ptr(&defer_free_objects); 5118 + 5119 + if (llist_add(head + s->offset, &df->objects)) 5120 + irq_work_queue(&df->work); 5121 + } 5122 + 5123 + static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) 5124 + { 5125 + struct defer_free *df = this_cpu_ptr(&defer_free_objects); 5126 + 5127 + slab->flush_freelist = flush_freelist; 5128 + if (llist_add(&slab->llnode, &df->slabs)) 5129 + irq_work_queue(&df->work); 5130 + } 5131 + 5132 + void defer_free_barrier(void) 5133 + { 5134 + int cpu; 5135 + 5136 + for_each_possible_cpu(cpu) 5137 + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); 5138 + } 5139 + 5924 5140 #ifndef CONFIG_SLUB_TINY 5925 5141 /* 5926 5142 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that ··· 6472 4626 struct slab *slab, void *head, void *tail, 6473 4627 int cnt, unsigned long addr) 6474 4628 { 4629 + /* cnt == 0 signals that it's called from kfree_nolock() */ 4630 + bool allow_spin = cnt; 6475 4631 struct kmem_cache_cpu *c; 6476 4632 unsigned long tid; 6477 4633 void **freelist; ··· 6492 4644 barrier(); 6493 4645 6494 4646 if (unlikely(slab != c->slab)) { 6495 - __slab_free(s, slab, head, tail, cnt, addr); 4647 + if (unlikely(!allow_spin)) { 4648 + /* 4649 + * __slab_free() can locklessly cmpxchg16 into a slab, 4650 + * but then it might need to take spin_lock or local_lock 4651 + * in put_cpu_partial() for further processing. 4652 + * Avoid the complexity and simply add to a deferred list. 4653 + */ 4654 + defer_free(s, head); 4655 + } else { 4656 + __slab_free(s, slab, head, tail, cnt, addr); 4657 + } 6496 4658 return; 4659 + } 4660 + 4661 + if (unlikely(!allow_spin)) { 4662 + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && 4663 + local_lock_is_locked(&s->cpu_slab->lock)) { 4664 + defer_free(s, head); 4665 + return; 4666 + } 4667 + cnt = 1; /* restore cnt. 
kfree_nolock() frees one object at a time */ 6497 4668 } 6498 4669 6499 4670 if (USE_LOCKLESS_FAST_PATH()) { ··· 6525 4658 goto redo; 6526 4659 } 6527 4660 } else { 4661 + __maybe_unused unsigned long flags = 0; 4662 + 6528 4663 /* Update the free list under the local lock */ 6529 - local_lock(&s->cpu_slab->lock); 4664 + local_lock_cpu_slab(s, flags); 6530 4665 c = this_cpu_ptr(s->cpu_slab); 6531 4666 if (unlikely(slab != c->slab)) { 6532 - local_unlock(&s->cpu_slab->lock); 4667 + local_unlock_cpu_slab(s, flags); 6533 4668 goto redo; 6534 4669 } 6535 4670 tid = c->tid; ··· 6541 4672 c->freelist = head; 6542 4673 c->tid = next_tid(tid); 6543 4674 6544 - local_unlock(&s->cpu_slab->lock); 4675 + local_unlock_cpu_slab(s, flags); 6545 4676 } 6546 4677 stat_add(s, FREE_FASTPATH, cnt); 6547 4678 } ··· 6561 4692 memcg_slab_free_hook(s, slab, &object, 1); 6562 4693 alloc_tagging_slab_free_hook(s, slab, &object, 1); 6563 4694 6564 - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6565 - do_slab_free(s, slab, object, object, 1, addr); 4695 + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) 4696 + return; 4697 + 4698 + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 4699 + slab_nid(slab) == numa_mem_id())) { 4700 + if (likely(free_to_pcs(s, object))) 4701 + return; 4702 + } 4703 + 4704 + do_slab_free(s, slab, object, object, 1, addr); 6566 4705 } 6567 4706 6568 4707 #ifdef CONFIG_MEMCG ··· 6771 4894 slab_free(s, slab, x, _RET_IP_); 6772 4895 } 6773 4896 EXPORT_SYMBOL(kfree); 4897 + 4898 + /* 4899 + * Can be called while holding raw_spinlock_t or from IRQ and NMI, 4900 + * but ONLY for objects allocated by kmalloc_nolock(). 4901 + * Debug checks (like kmemleak and kfence) were skipped on allocation, 4902 + * hence 4903 + * obj = kmalloc(); kfree_nolock(obj); 4904 + * will miss kmemleak/kfence book keeping and will cause false positives. 4905 + * large_kmalloc is not supported either. 4906 + */ 4907 + void kfree_nolock(const void *object) 4908 + { 4909 + struct folio *folio; 4910 + struct slab *slab; 4911 + struct kmem_cache *s; 4912 + void *x = (void *)object; 4913 + 4914 + if (unlikely(ZERO_OR_NULL_PTR(object))) 4915 + return; 4916 + 4917 + folio = virt_to_folio(object); 4918 + if (unlikely(!folio_test_slab(folio))) { 4919 + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); 4920 + return; 4921 + } 4922 + 4923 + slab = folio_slab(folio); 4924 + s = slab->slab_cache; 4925 + 4926 + memcg_slab_free_hook(s, slab, &x, 1); 4927 + alloc_tagging_slab_free_hook(s, slab, &x, 1); 4928 + /* 4929 + * Unlike slab_free() do NOT call the following: 4930 + * kmemleak_free_recursive(x, s->flags); 4931 + * debug_check_no_locks_freed(x, s->object_size); 4932 + * debug_check_no_obj_freed(x, s->object_size); 4933 + * __kcsan_check_access(x, s->object_size, ..); 4934 + * kfence_free(x); 4935 + * since they take spinlocks or not safe from any context. 4936 + */ 4937 + kmsan_slab_free(s, x); 4938 + /* 4939 + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() 4940 + * which will call raw_spin_lock_irqsave() which is technically 4941 + * unsafe from NMI, but take chance and report kernel bug. 4942 + * The sequence of 4943 + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI 4944 + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU 4945 + * is double buggy and deserves to deadlock. 4946 + */ 4947 + if (kasan_slab_pre_free(s, x)) 4948 + return; 4949 + /* 4950 + * memcg, kasan_slab_pre_free are done for 'x'. 
4951 + * The only thing left is kasan_poison without quarantine, 4952 + * since kasan quarantine takes locks and not supported from NMI. 4953 + */ 4954 + kasan_slab_free(s, x, false, false, /* skip quarantine */true); 4955 + #ifndef CONFIG_SLUB_TINY 4956 + do_slab_free(s, slab, x, x, 0, _RET_IP_); 4957 + #else 4958 + defer_free(s, x); 4959 + #endif 4960 + } 4961 + EXPORT_SYMBOL_GPL(kfree_nolock); 6774 4962 6775 4963 static __always_inline __realloc_size(2) void * 6776 4964 __do_krealloc(const void *p, size_t new_size, gfp_t flags) ··· 7230 5288 if (!size) 7231 5289 return; 7232 5290 5291 + /* 5292 + * freeing to sheaves is so incompatible with the detached freelist so 5293 + * once we go that way, we have to do everything differently 5294 + */ 5295 + if (s && s->cpu_sheaves) { 5296 + free_to_pcs_bulk(s, size, p); 5297 + return; 5298 + } 5299 + 7233 5300 do { 7234 5301 struct detached_freelist df; 7235 5302 ··· 7357 5406 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, 7358 5407 void **p) 7359 5408 { 7360 - int i; 5409 + unsigned int i = 0; 7361 5410 7362 5411 if (!size) 7363 5412 return 0; ··· 7366 5415 if (unlikely(!s)) 7367 5416 return 0; 7368 5417 7369 - i = __kmem_cache_alloc_bulk(s, flags, size, p); 7370 - if (unlikely(i == 0)) 7371 - return 0; 5418 + if (s->cpu_sheaves) 5419 + i = alloc_from_pcs_bulk(s, size, p); 5420 + 5421 + if (i < size) { 5422 + /* 5423 + * If we ran out of memory, don't bother with freeing back to 5424 + * the percpu sheaves, we have bigger problems. 5425 + */ 5426 + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { 5427 + if (i > 0) 5428 + __kmem_cache_free_bulk(s, i, p); 5429 + return 0; 5430 + } 5431 + } 7372 5432 7373 5433 /* 7374 5434 * memcg and kmem_cache debug support and memory initialization. 
··· 7389 5427 slab_want_init_on_alloc(flags, s), s->object_size))) { 7390 5428 return 0; 7391 5429 } 7392 - return i; 5430 + 5431 + return size; 7393 5432 } 7394 5433 EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); 7395 - 7396 5434 7397 5435 /* 7398 5436 * Object placement in a slab is made very easy because we always start at ··· 7527 5565 } 7528 5566 7529 5567 static void 7530 - init_kmem_cache_node(struct kmem_cache_node *n) 5568 + init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) 7531 5569 { 7532 5570 n->nr_partial = 0; 7533 5571 spin_lock_init(&n->list_lock); ··· 7537 5575 atomic_long_set(&n->total_objects, 0); 7538 5576 INIT_LIST_HEAD(&n->full); 7539 5577 #endif 5578 + n->barn = barn; 5579 + if (barn) 5580 + barn_init(barn); 7540 5581 } 7541 5582 7542 5583 #ifndef CONFIG_SLUB_TINY ··· 7569 5604 return 1; 7570 5605 } 7571 5606 #endif /* CONFIG_SLUB_TINY */ 5607 + 5608 + static int init_percpu_sheaves(struct kmem_cache *s) 5609 + { 5610 + int cpu; 5611 + 5612 + for_each_possible_cpu(cpu) { 5613 + struct slub_percpu_sheaves *pcs; 5614 + 5615 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 5616 + 5617 + local_trylock_init(&pcs->lock); 5618 + 5619 + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 5620 + 5621 + if (!pcs->main) 5622 + return -ENOMEM; 5623 + } 5624 + 5625 + return 0; 5626 + } 7572 5627 7573 5628 static struct kmem_cache *kmem_cache_node; 7574 5629 ··· 7625 5640 slab->freelist = get_freepointer(kmem_cache_node, n); 7626 5641 slab->inuse = 1; 7627 5642 kmem_cache_node->node[node] = n; 7628 - init_kmem_cache_node(n); 5643 + init_kmem_cache_node(n, NULL); 7629 5644 inc_slabs_node(kmem_cache_node, node, slab->objects); 7630 5645 7631 5646 /* ··· 7641 5656 struct kmem_cache_node *n; 7642 5657 7643 5658 for_each_kmem_cache_node(s, node, n) { 5659 + if (n->barn) { 5660 + WARN_ON(n->barn->nr_full); 5661 + WARN_ON(n->barn->nr_empty); 5662 + kfree(n->barn); 5663 + n->barn = NULL; 5664 + } 5665 + 7644 5666 s->node[node] = NULL; 7645 5667 kmem_cache_free(kmem_cache_node, n); 7646 5668 } ··· 7656 5664 void __kmem_cache_release(struct kmem_cache *s) 7657 5665 { 7658 5666 cache_random_seq_destroy(s); 5667 + if (s->cpu_sheaves) 5668 + pcs_destroy(s); 7659 5669 #ifndef CONFIG_SLUB_TINY 5670 + #ifdef CONFIG_PREEMPT_RT 5671 + lockdep_unregister_key(&s->lock_key); 5672 + #endif 7660 5673 free_percpu(s->cpu_slab); 7661 5674 #endif 7662 5675 free_kmem_cache_nodes(s); ··· 7673 5676 7674 5677 for_each_node_mask(node, slab_nodes) { 7675 5678 struct kmem_cache_node *n; 5679 + struct node_barn *barn = NULL; 7676 5680 7677 5681 if (slab_state == DOWN) { 7678 5682 early_kmem_cache_node_alloc(node); 7679 5683 continue; 7680 5684 } 5685 + 5686 + if (s->cpu_sheaves) { 5687 + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); 5688 + 5689 + if (!barn) 5690 + return 0; 5691 + } 5692 + 7681 5693 n = kmem_cache_alloc_node(kmem_cache_node, 7682 5694 GFP_KERNEL, node); 7683 - 7684 5695 if (!n) { 7685 - free_kmem_cache_nodes(s); 5696 + kfree(barn); 7686 5697 return 0; 7687 5698 } 7688 5699 7689 - init_kmem_cache_node(n); 5700 + init_kmem_cache_node(n, barn); 5701 + 7690 5702 s->node[node] = n; 7691 5703 } 7692 5704 return 1; ··· 7950 5944 struct kmem_cache_node *n; 7951 5945 7952 5946 flush_all_cpus_locked(s); 5947 + 5948 + /* we might have rcu sheaves in flight */ 5949 + if (s->cpu_sheaves) 5950 + rcu_barrier(); 5951 + 7953 5952 /* Attempt to free all objects */ 7954 5953 for_each_kmem_cache_node(s, node, n) { 5954 + if (n->barn) 5955 + barn_shrink(s, n->barn); 7955 5956 free_partial(s, n); 7956 5957 
if (n->nr_partial || node_nr_slabs(n)) 7957 5958 return 1; ··· 8162 6149 for (i = 0; i < SHRINK_PROMOTE_MAX; i++) 8163 6150 INIT_LIST_HEAD(promote + i); 8164 6151 6152 + if (n->barn) 6153 + barn_shrink(s, n->barn); 6154 + 8165 6155 spin_lock_irqsave(&n->list_lock, flags); 8166 6156 8167 6157 /* ··· 8244 6228 */ 8245 6229 mutex_lock(&slab_mutex); 8246 6230 list_for_each_entry(s, &slab_caches, list) { 6231 + struct node_barn *barn = NULL; 6232 + 8247 6233 /* 8248 6234 * The structure may already exist if the node was previously 8249 6235 * onlined and offlined. 8250 6236 */ 8251 6237 if (get_node(s, nid)) 8252 6238 continue; 6239 + 6240 + if (s->cpu_sheaves) { 6241 + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); 6242 + 6243 + if (!barn) { 6244 + ret = -ENOMEM; 6245 + goto out; 6246 + } 6247 + } 6248 + 8253 6249 /* 8254 6250 * XXX: kmem_cache_alloc_node will fallback to other nodes 8255 6251 * since memory is not yet available from the node that ··· 8269 6241 */ 8270 6242 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); 8271 6243 if (!n) { 6244 + kfree(barn); 8272 6245 ret = -ENOMEM; 8273 6246 goto out; 8274 6247 } 8275 - init_kmem_cache_node(n); 6248 + 6249 + init_kmem_cache_node(n, barn); 6250 + 8276 6251 s->node[nid] = n; 8277 6252 } 8278 6253 /* ··· 8488 6457 8489 6458 set_cpu_partial(s); 8490 6459 6460 + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) 6461 + && !(s->flags & SLAB_DEBUG_FLAGS)) { 6462 + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); 6463 + if (!s->cpu_sheaves) { 6464 + err = -ENOMEM; 6465 + goto out; 6466 + } 6467 + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? 6468 + s->sheaf_capacity = args->sheaf_capacity; 6469 + } 6470 + 8491 6471 #ifdef CONFIG_NUMA 8492 6472 s->remote_node_defrag_ratio = 1000; 8493 6473 #endif ··· 8514 6472 8515 6473 if (!alloc_kmem_cache_cpus(s)) 8516 6474 goto out; 6475 + 6476 + if (s->cpu_sheaves) { 6477 + err = init_percpu_sheaves(s); 6478 + if (err) 6479 + goto out; 6480 + } 8517 6481 8518 6482 err = 0; 8519 6483 ··· 8561 6513 { 8562 6514 void *p; 8563 6515 void *addr = slab_address(slab); 6516 + 6517 + if (!validate_slab_ptr(slab)) { 6518 + slab_err(s, slab, "Not a valid slab page"); 6519 + return; 6520 + } 8564 6521 8565 6522 if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) 8566 6523 return; ··· 8978 6925 } 8979 6926 SLAB_ATTR_RO(order); 8980 6927 6928 + static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) 6929 + { 6930 + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); 6931 + } 6932 + SLAB_ATTR_RO(sheaf_capacity); 6933 + 8981 6934 static ssize_t min_partial_show(struct kmem_cache *s, char *buf) 8982 6935 { 8983 6936 return sysfs_emit(buf, "%lu\n", s->min_partial); ··· 9331 7272 } \ 9332 7273 SLAB_ATTR(text); \ 9333 7274 7275 + STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); 9334 7276 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 9335 7277 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 7278 + STAT_ATTR(FREE_PCS, free_cpu_sheaf); 7279 + STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); 7280 + STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); 9336 7281 STAT_ATTR(FREE_FASTPATH, free_fastpath); 9337 7282 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 9338 7283 STAT_ATTR(FREE_FROZEN, free_frozen); ··· 9361 7298 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 9362 7299 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 9363 7300 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 7301 + STAT_ATTR(SHEAF_FLUSH, sheaf_flush); 7302 + STAT_ATTR(SHEAF_REFILL, sheaf_refill); 7303 + STAT_ATTR(SHEAF_ALLOC, 
sheaf_alloc); 7304 + STAT_ATTR(SHEAF_FREE, sheaf_free); 7305 + STAT_ATTR(BARN_GET, barn_get); 7306 + STAT_ATTR(BARN_GET_FAIL, barn_get_fail); 7307 + STAT_ATTR(BARN_PUT, barn_put); 7308 + STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); 7309 + STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast); 7310 + STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow); 7311 + STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize); 7312 + STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast); 7313 + STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow); 9364 7314 #endif /* CONFIG_SLUB_STATS */ 9365 7315 9366 7316 #ifdef CONFIG_KFENCE ··· 9404 7328 &object_size_attr.attr, 9405 7329 &objs_per_slab_attr.attr, 9406 7330 &order_attr.attr, 7331 + &sheaf_capacity_attr.attr, 9407 7332 &min_partial_attr.attr, 9408 7333 &cpu_partial_attr.attr, 9409 7334 &objects_partial_attr.attr, ··· 9436 7359 &remote_node_defrag_ratio_attr.attr, 9437 7360 #endif 9438 7361 #ifdef CONFIG_SLUB_STATS 7362 + &alloc_cpu_sheaf_attr.attr, 9439 7363 &alloc_fastpath_attr.attr, 9440 7364 &alloc_slowpath_attr.attr, 7365 + &free_cpu_sheaf_attr.attr, 7366 + &free_rcu_sheaf_attr.attr, 7367 + &free_rcu_sheaf_fail_attr.attr, 9441 7368 &free_fastpath_attr.attr, 9442 7369 &free_slowpath_attr.attr, 9443 7370 &free_frozen_attr.attr, ··· 9466 7385 &cpu_partial_free_attr.attr, 9467 7386 &cpu_partial_node_attr.attr, 9468 7387 &cpu_partial_drain_attr.attr, 7388 + &sheaf_flush_attr.attr, 7389 + &sheaf_refill_attr.attr, 7390 + &sheaf_alloc_attr.attr, 7391 + &sheaf_free_attr.attr, 7392 + &barn_get_attr.attr, 7393 + &barn_get_fail_attr.attr, 7394 + &barn_put_attr.attr, 7395 + &barn_put_fail_attr.attr, 7396 + &sheaf_prefill_fast_attr.attr, 7397 + &sheaf_prefill_slow_attr.attr, 7398 + &sheaf_prefill_oversize_attr.attr, 7399 + &sheaf_return_fast_attr.attr, 7400 + &sheaf_return_slow_attr.attr, 9469 7401 #endif 9470 7402 #ifdef CONFIG_FAILSLAB 9471 7403 &failslab_attr.attr, ··· 9820 7726 return NULL; 9821 7727 } 9822 7728 9823 - static int cmp_loc_by_count(const void *a, const void *b, const void *data) 7729 + static int cmp_loc_by_count(const void *a, const void *b) 9824 7730 { 9825 7731 struct location *loc1 = (struct location *)a; 9826 7732 struct location *loc2 = (struct location *)b; 9827 7733 9828 - if (loc1->count > loc2->count) 9829 - return -1; 9830 - else 9831 - return 1; 7734 + return cmp_int(loc2->count, loc1->count); 9832 7735 } 9833 7736 9834 7737 static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) ··· 9887 7796 } 9888 7797 9889 7798 /* Sort locations by count */ 9890 - sort_r(t->loc, t->count, sizeof(struct location), 9891 - cmp_loc_by_count, NULL, NULL); 7799 + sort(t->loc, t->count, sizeof(struct location), 7800 + cmp_loc_by_count, NULL); 9892 7801 9893 7802 bitmap_free(obj_map); 9894 7803 return 0;
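The mm/slub.c part above introduces two caller-visible interfaces: prefilled sheaves for bounded preallocation, and kmalloc_nolock()/kfree_nolock() for opportunistic any-context allocations. A minimal usage sketch for prefilled sheaves (the cache pointer and the bounds are hypothetical, not taken from the patch):

    struct slab_sheaf *sheaf;
    void *obj;

    /* preallocate an upper bound of objects for later consumption */
    sheaf = kmem_cache_prefill_sheaf(s, GFP_KERNEL, max_objs);
    if (!sheaf)
            return -ENOMEM;

    /* cannot fail for up to max_objs allocations; no fallback once emptied */
    obj = kmem_cache_alloc_from_sheaf(s, __GFP_ZERO, sheaf);

    /* if the upper bound grows, top the sheaf up (it may get replaced) */
    if (kmem_cache_refill_sheaf(s, GFP_KERNEL, &sheaf, new_max))
            return -ENOMEM;

    /* hand the unused objects back to the cache */
    kmem_cache_return_sheaf(s, GFP_KERNEL, sheaf);

For kmalloc_nolock(), only __GFP_ACCOUNT, __GFP_ZERO and __GFP_NO_OBJ_EXT are accepted and a NULL return is final, per the kerneldoc above; an assumed calling pattern:

    /* e.g. from NMI, hardirq or tracing context */
    p = kmalloc_nolock(sizeof(*p), __GFP_ZERO, NUMA_NO_NODE);
    if (!p)
            return;         /* ENOMEM (or unsupported RT irq context); do not retry */
    ...
    kfree_nolock(p);        /* only valid for objects from kmalloc_nolock() */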
+1
mm/vma_init.c
··· 16 16 struct kmem_cache_args args = { 17 17 .use_freeptr_offset = true, 18 18 .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), 19 + .sheaf_capacity = 32, 19 20 }; 20 21 21 22 vm_area_cachep = kmem_cache_create("vm_area_struct",
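Any other cache can opt into percpu sheaves the same way, by passing a non-zero sheaf_capacity in its kmem_cache_args. An illustrative (hypothetical) example; note that per the do_kmem_cache_create() hunk above the capacity is silently ignored for SLUB_TINY builds and for caches with debugging flags:

    struct kmem_cache_args args = {
            .align          = __alignof__(struct foo),
            .sheaf_capacity = 32,
    };

    foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), &args,
                                   SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);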
+161 -4
tools/include/linux/slab.h
··· 4 4 5 5 #include <linux/types.h> 6 6 #include <linux/gfp.h> 7 + #include <pthread.h> 7 8 8 - #define SLAB_PANIC 2 9 9 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 10 10 11 11 #define kzalloc_node(size, flags, node) kmalloc(size, flags) 12 + enum _slab_flag_bits { 13 + _SLAB_KMALLOC, 14 + _SLAB_HWCACHE_ALIGN, 15 + _SLAB_PANIC, 16 + _SLAB_TYPESAFE_BY_RCU, 17 + _SLAB_ACCOUNT, 18 + _SLAB_FLAGS_LAST_BIT 19 + }; 20 + 21 + #define __SLAB_FLAG_BIT(nr) ((unsigned int __force)(1U << (nr))) 22 + #define __SLAB_FLAG_UNUSED ((unsigned int __force)(0U)) 23 + 24 + #define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN) 25 + #define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC) 26 + #define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU) 27 + #ifdef CONFIG_MEMCG 28 + # define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) 29 + #else 30 + # define SLAB_ACCOUNT __SLAB_FLAG_UNUSED 31 + #endif 12 32 13 33 void *kmalloc(size_t size, gfp_t gfp); 14 34 void kfree(void *p); ··· 41 21 PARTIAL, 42 22 UP, 43 23 FULL 24 + }; 25 + 26 + struct kmem_cache { 27 + pthread_mutex_t lock; 28 + unsigned int size; 29 + unsigned int align; 30 + unsigned int sheaf_capacity; 31 + int nr_objs; 32 + void *objs; 33 + void (*ctor)(void *); 34 + bool non_kernel_enabled; 35 + unsigned int non_kernel; 36 + unsigned long nr_allocated; 37 + unsigned long nr_tallocated; 38 + bool exec_callback; 39 + void (*callback)(void *); 40 + void *private; 41 + }; 42 + 43 + struct kmem_cache_args { 44 + /** 45 + * @align: The required alignment for the objects. 46 + * 47 + * %0 means no specific alignment is requested. 48 + */ 49 + unsigned int align; 50 + /** 51 + * @sheaf_capacity: The maximum size of the sheaf. 52 + */ 53 + unsigned int sheaf_capacity; 54 + /** 55 + * @useroffset: Usercopy region offset. 56 + * 57 + * %0 is a valid offset, when @usersize is non-%0 58 + */ 59 + unsigned int useroffset; 60 + /** 61 + * @usersize: Usercopy region size. 62 + * 63 + * %0 means no usercopy region is specified. 64 + */ 65 + unsigned int usersize; 66 + /** 67 + * @freeptr_offset: Custom offset for the free pointer 68 + * in &SLAB_TYPESAFE_BY_RCU caches 69 + * 70 + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer 71 + * outside of the object. This might cause the object to grow in size. 72 + * Cache creators that have a reason to avoid this can specify a custom 73 + * free pointer offset in their struct where the free pointer will be 74 + * placed. 75 + * 76 + * Note that placing the free pointer inside the object requires the 77 + * caller to ensure that no fields are invalidated that are required to 78 + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for 79 + * details). 80 + * 81 + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset 82 + * is specified, %use_freeptr_offset must be set %true. 83 + * 84 + * Note that @ctor currently isn't supported with custom free pointers 85 + * as a @ctor requires an external free pointer. 86 + */ 87 + unsigned int freeptr_offset; 88 + /** 89 + * @use_freeptr_offset: Whether a @freeptr_offset is used. 90 + */ 91 + bool use_freeptr_offset; 92 + /** 93 + * @ctor: A constructor for the objects. 94 + * 95 + * The constructor is invoked for each object in a newly allocated slab 96 + * page. It is the cache user's responsibility to free object in the 97 + * same state as after calling the constructor, or deal appropriately 98 + * with any differences between a freshly constructed and a reallocated 99 + * object. 
100 + * 101 + * %NULL means no constructor. 102 + */ 103 + void (*ctor)(void *); 104 + }; 105 + 106 + struct slab_sheaf { 107 + union { 108 + struct list_head barn_list; 109 + /* only used for prefilled sheafs */ 110 + unsigned int capacity; 111 + }; 112 + struct kmem_cache *cache; 113 + unsigned int size; 114 + int node; /* only used for rcu_sheaf */ 115 + void *objects[]; 44 116 }; 45 117 46 118 static inline void *kzalloc(size_t size, gfp_t gfp) ··· 149 37 } 150 38 void kmem_cache_free(struct kmem_cache *cachep, void *objp); 151 39 152 - struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, 153 - unsigned int align, unsigned int flags, 154 - void (*ctor)(void *)); 40 + 41 + struct kmem_cache * 42 + __kmem_cache_create_args(const char *name, unsigned int size, 43 + struct kmem_cache_args *args, unsigned int flags); 44 + 45 + /* If NULL is passed for @args, use this variant with default arguments. */ 46 + static inline struct kmem_cache * 47 + __kmem_cache_default_args(const char *name, unsigned int size, 48 + struct kmem_cache_args *args, unsigned int flags) 49 + { 50 + struct kmem_cache_args kmem_default_args = {}; 51 + 52 + return __kmem_cache_create_args(name, size, &kmem_default_args, flags); 53 + } 54 + 55 + static inline struct kmem_cache * 56 + __kmem_cache_create(const char *name, unsigned int size, unsigned int align, 57 + unsigned int flags, void (*ctor)(void *)) 58 + { 59 + struct kmem_cache_args kmem_args = { 60 + .align = align, 61 + .ctor = ctor, 62 + }; 63 + 64 + return __kmem_cache_create_args(name, size, &kmem_args, flags); 65 + } 66 + 67 + #define kmem_cache_create(__name, __object_size, __args, ...) \ 68 + _Generic((__args), \ 69 + struct kmem_cache_args *: __kmem_cache_create_args, \ 70 + void *: __kmem_cache_default_args, \ 71 + default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) 155 72 156 73 void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list); 157 74 int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, 158 75 void **list); 76 + struct slab_sheaf * 77 + kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); 78 + 79 + void * 80 + kmem_cache_alloc_from_sheaf(struct kmem_cache *s, gfp_t gfp, 81 + struct slab_sheaf *sheaf); 82 + 83 + void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, 84 + struct slab_sheaf *sheaf); 85 + int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, 86 + struct slab_sheaf **sheafp, unsigned int size); 87 + 88 + static inline unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) 89 + { 90 + return sheaf->size; 91 + } 159 92 160 93 #endif /* _TOOLS_SLAB_H */
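The _Generic-based kmem_cache_create() wrapper above keeps both calling conventions working in the userspace test harness; for illustration (cache names and the obj/obj_ctor symbols are hypothetical):

    /* struct kmem_cache_args * resolves to __kmem_cache_create_args() */
    struct kmem_cache_args args = { .align = 64, .sheaf_capacity = 32 };
    cache = kmem_cache_create("test-args", sizeof(struct obj), &args, SLAB_PANIC);

    /* NULL resolves to __kmem_cache_default_args() (empty default args) */
    cache = kmem_cache_create("test-default", sizeof(struct obj), NULL, SLAB_PANIC);

    /* the legacy (align, flags, ctor) form resolves to __kmem_cache_create() */
    cache = kmem_cache_create("test-legacy", sizeof(struct obj), 64, SLAB_PANIC, obj_ctor);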
+26 -488
tools/testing/radix-tree/maple.c
··· 8 8 * difficult to handle in kernel tests. 9 9 */ 10 10 11 - #define CONFIG_DEBUG_MAPLE_TREE 12 - #define CONFIG_MAPLE_SEARCH 13 - #define MAPLE_32BIT (MAPLE_NODE_SLOTS > 31) 14 - #include "test.h" 15 - #include <stdlib.h> 16 - #include <time.h> 17 - #include <linux/init.h> 18 - 19 11 #define module_init(x) 20 12 #define module_exit(x) 21 13 #define MODULE_AUTHOR(x) ··· 15 23 #define MODULE_LICENSE(x) 16 24 #define dump_stack() assert(0) 17 25 18 - #include "../../../lib/maple_tree.c" 26 + #include "test.h" 27 + 28 + #include "../shared/maple-shim.c" 19 29 #include "../../../lib/test_maple_tree.c" 20 30 21 31 #define RCU_RANGE_COUNT 1000 ··· 56 62 int next; 57 63 struct rcu_test_struct2 *test; 58 64 }; 59 - 60 - static int get_alloc_node_count(struct ma_state *mas) 61 - { 62 - int count = 1; 63 - struct maple_alloc *node = mas->alloc; 64 - 65 - if (!node || ((unsigned long)node & 0x1)) 66 - return 0; 67 - while (node->node_count) { 68 - count += node->node_count; 69 - node = node->slot[0]; 70 - } 71 - return count; 72 - } 73 - 74 - static void check_mas_alloc_node_count(struct ma_state *mas) 75 - { 76 - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL); 77 - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL); 78 - MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total); 79 - mas_destroy(mas); 80 - } 81 - 82 - /* 83 - * check_new_node() - Check the creation of new nodes and error path 84 - * verification. 85 - */ 86 - static noinline void __init check_new_node(struct maple_tree *mt) 87 - { 88 - 89 - struct maple_node *mn, *mn2, *mn3; 90 - struct maple_alloc *smn; 91 - struct maple_node *nodes[100]; 92 - int i, j, total; 93 - 94 - MA_STATE(mas, mt, 0, 0); 95 - 96 - check_mas_alloc_node_count(&mas); 97 - 98 - /* Try allocating 3 nodes */ 99 - mtree_lock(mt); 100 - mt_set_non_kernel(0); 101 - /* request 3 nodes to be allocated. */ 102 - mas_node_count(&mas, 3); 103 - /* Allocation request of 3. */ 104 - MT_BUG_ON(mt, mas_alloc_req(&mas) != 3); 105 - /* Allocate failed. */ 106 - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); 107 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 108 - 109 - MT_BUG_ON(mt, mas_allocated(&mas) != 3); 110 - mn = mas_pop_node(&mas); 111 - MT_BUG_ON(mt, not_empty(mn)); 112 - MT_BUG_ON(mt, mn == NULL); 113 - MT_BUG_ON(mt, mas.alloc == NULL); 114 - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); 115 - mas_push_node(&mas, mn); 116 - mas_reset(&mas); 117 - mas_destroy(&mas); 118 - mtree_unlock(mt); 119 - 120 - 121 - /* Try allocating 1 node, then 2 more */ 122 - mtree_lock(mt); 123 - /* Set allocation request to 1. */ 124 - mas_set_alloc_req(&mas, 1); 125 - /* Check Allocation request of 1. */ 126 - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); 127 - mas_set_err(&mas, -ENOMEM); 128 - /* Validate allocation request. */ 129 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 130 - /* Eat the requested node. */ 131 - mn = mas_pop_node(&mas); 132 - MT_BUG_ON(mt, not_empty(mn)); 133 - MT_BUG_ON(mt, mn == NULL); 134 - MT_BUG_ON(mt, mn->slot[0] != NULL); 135 - MT_BUG_ON(mt, mn->slot[1] != NULL); 136 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 137 - 138 - mn->parent = ma_parent_ptr(mn); 139 - ma_free_rcu(mn); 140 - mas.status = ma_start; 141 - mas_destroy(&mas); 142 - /* Allocate 3 nodes, will fail. */ 143 - mas_node_count(&mas, 3); 144 - /* Drop the lock and allocate 3 nodes. */ 145 - mas_nomem(&mas, GFP_KERNEL); 146 - /* Ensure 3 are allocated. */ 147 - MT_BUG_ON(mt, mas_allocated(&mas) != 3); 148 - /* Allocation request of 0. 
*/ 149 - MT_BUG_ON(mt, mas_alloc_req(&mas) != 0); 150 - 151 - MT_BUG_ON(mt, mas.alloc == NULL); 152 - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); 153 - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); 154 - /* Ensure we counted 3. */ 155 - MT_BUG_ON(mt, mas_allocated(&mas) != 3); 156 - /* Free. */ 157 - mas_reset(&mas); 158 - mas_destroy(&mas); 159 - 160 - /* Set allocation request to 1. */ 161 - mas_set_alloc_req(&mas, 1); 162 - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); 163 - mas_set_err(&mas, -ENOMEM); 164 - /* Validate allocation request. */ 165 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 166 - MT_BUG_ON(mt, mas_allocated(&mas) != 1); 167 - /* Check the node is only one node. */ 168 - mn = mas_pop_node(&mas); 169 - MT_BUG_ON(mt, not_empty(mn)); 170 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 171 - MT_BUG_ON(mt, mn == NULL); 172 - MT_BUG_ON(mt, mn->slot[0] != NULL); 173 - MT_BUG_ON(mt, mn->slot[1] != NULL); 174 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 175 - mas_push_node(&mas, mn); 176 - MT_BUG_ON(mt, mas_allocated(&mas) != 1); 177 - MT_BUG_ON(mt, mas.alloc->node_count); 178 - 179 - mas_set_alloc_req(&mas, 2); /* request 2 more. */ 180 - MT_BUG_ON(mt, mas_alloc_req(&mas) != 2); 181 - mas_set_err(&mas, -ENOMEM); 182 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 183 - MT_BUG_ON(mt, mas_allocated(&mas) != 3); 184 - MT_BUG_ON(mt, mas.alloc == NULL); 185 - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); 186 - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); 187 - for (i = 2; i >= 0; i--) { 188 - mn = mas_pop_node(&mas); 189 - MT_BUG_ON(mt, mas_allocated(&mas) != i); 190 - MT_BUG_ON(mt, !mn); 191 - MT_BUG_ON(mt, not_empty(mn)); 192 - mn->parent = ma_parent_ptr(mn); 193 - ma_free_rcu(mn); 194 - } 195 - 196 - total = 64; 197 - mas_set_alloc_req(&mas, total); /* request 2 more. 
*/ 198 - MT_BUG_ON(mt, mas_alloc_req(&mas) != total); 199 - mas_set_err(&mas, -ENOMEM); 200 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 201 - for (i = total; i > 0; i--) { 202 - unsigned int e = 0; /* expected node_count */ 203 - 204 - if (!MAPLE_32BIT) { 205 - if (i >= 35) 206 - e = i - 34; 207 - else if (i >= 5) 208 - e = i - 4; 209 - else if (i >= 2) 210 - e = i - 1; 211 - } else { 212 - if (i >= 4) 213 - e = i - 3; 214 - else if (i >= 1) 215 - e = i - 1; 216 - else 217 - e = 0; 218 - } 219 - 220 - MT_BUG_ON(mt, mas.alloc->node_count != e); 221 - mn = mas_pop_node(&mas); 222 - MT_BUG_ON(mt, not_empty(mn)); 223 - MT_BUG_ON(mt, mas_allocated(&mas) != i - 1); 224 - MT_BUG_ON(mt, !mn); 225 - mn->parent = ma_parent_ptr(mn); 226 - ma_free_rcu(mn); 227 - } 228 - 229 - total = 100; 230 - for (i = 1; i < total; i++) { 231 - mas_set_alloc_req(&mas, i); 232 - mas_set_err(&mas, -ENOMEM); 233 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 234 - for (j = i; j > 0; j--) { 235 - mn = mas_pop_node(&mas); 236 - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); 237 - MT_BUG_ON(mt, !mn); 238 - MT_BUG_ON(mt, not_empty(mn)); 239 - mas_push_node(&mas, mn); 240 - MT_BUG_ON(mt, mas_allocated(&mas) != j); 241 - mn = mas_pop_node(&mas); 242 - MT_BUG_ON(mt, not_empty(mn)); 243 - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); 244 - mn->parent = ma_parent_ptr(mn); 245 - ma_free_rcu(mn); 246 - } 247 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 248 - 249 - mas_set_alloc_req(&mas, i); 250 - mas_set_err(&mas, -ENOMEM); 251 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 252 - for (j = 0; j <= i/2; j++) { 253 - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); 254 - nodes[j] = mas_pop_node(&mas); 255 - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); 256 - } 257 - 258 - while (j) { 259 - j--; 260 - mas_push_node(&mas, nodes[j]); 261 - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); 262 - } 263 - MT_BUG_ON(mt, mas_allocated(&mas) != i); 264 - for (j = 0; j <= i/2; j++) { 265 - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); 266 - mn = mas_pop_node(&mas); 267 - MT_BUG_ON(mt, not_empty(mn)); 268 - mn->parent = ma_parent_ptr(mn); 269 - ma_free_rcu(mn); 270 - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); 271 - } 272 - mas_reset(&mas); 273 - MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); 274 - mas_destroy(&mas); 275 - 276 - } 277 - 278 - /* Set allocation request. */ 279 - total = 500; 280 - mas_node_count(&mas, total); 281 - /* Drop the lock and allocate the nodes. */ 282 - mas_nomem(&mas, GFP_KERNEL); 283 - MT_BUG_ON(mt, !mas.alloc); 284 - i = 1; 285 - smn = mas.alloc; 286 - while (i < total) { 287 - for (j = 0; j < MAPLE_ALLOC_SLOTS; j++) { 288 - i++; 289 - MT_BUG_ON(mt, !smn->slot[j]); 290 - if (i == total) 291 - break; 292 - } 293 - smn = smn->slot[0]; /* next. */ 294 - } 295 - MT_BUG_ON(mt, mas_allocated(&mas) != total); 296 - mas_reset(&mas); 297 - mas_destroy(&mas); /* Free. */ 298 - 299 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 300 - for (i = 1; i < 128; i++) { 301 - mas_node_count(&mas, i); /* Request */ 302 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 303 - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ 304 - for (j = i; j > 0; j--) { /*Free the requests */ 305 - mn = mas_pop_node(&mas); /* get the next node. 
*/ 306 - MT_BUG_ON(mt, mn == NULL); 307 - MT_BUG_ON(mt, not_empty(mn)); 308 - mn->parent = ma_parent_ptr(mn); 309 - ma_free_rcu(mn); 310 - } 311 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 312 - } 313 - 314 - for (i = 1; i < MAPLE_NODE_MASK + 1; i++) { 315 - MA_STATE(mas2, mt, 0, 0); 316 - mas_node_count(&mas, i); /* Request */ 317 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 318 - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ 319 - for (j = 1; j <= i; j++) { /* Move the allocations to mas2 */ 320 - mn = mas_pop_node(&mas); /* get the next node. */ 321 - MT_BUG_ON(mt, mn == NULL); 322 - MT_BUG_ON(mt, not_empty(mn)); 323 - mas_push_node(&mas2, mn); 324 - MT_BUG_ON(mt, mas_allocated(&mas2) != j); 325 - } 326 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 327 - MT_BUG_ON(mt, mas_allocated(&mas2) != i); 328 - 329 - for (j = i; j > 0; j--) { /*Free the requests */ 330 - MT_BUG_ON(mt, mas_allocated(&mas2) != j); 331 - mn = mas_pop_node(&mas2); /* get the next node. */ 332 - MT_BUG_ON(mt, mn == NULL); 333 - MT_BUG_ON(mt, not_empty(mn)); 334 - mn->parent = ma_parent_ptr(mn); 335 - ma_free_rcu(mn); 336 - } 337 - MT_BUG_ON(mt, mas_allocated(&mas2) != 0); 338 - } 339 - 340 - 341 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 342 - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ 343 - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); 344 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 345 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); 346 - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); 347 - 348 - mn = mas_pop_node(&mas); /* get the next node. */ 349 - MT_BUG_ON(mt, mn == NULL); 350 - MT_BUG_ON(mt, not_empty(mn)); 351 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); 352 - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); 353 - 354 - mas_push_node(&mas, mn); 355 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); 356 - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); 357 - 358 - /* Check the limit of pop/push/pop */ 359 - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ 360 - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); 361 - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); 362 - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); 363 - MT_BUG_ON(mt, mas_alloc_req(&mas)); 364 - MT_BUG_ON(mt, mas.alloc->node_count != 1); 365 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); 366 - mn = mas_pop_node(&mas); 367 - MT_BUG_ON(mt, not_empty(mn)); 368 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); 369 - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); 370 - mas_push_node(&mas, mn); 371 - MT_BUG_ON(mt, mas.alloc->node_count != 1); 372 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); 373 - mn = mas_pop_node(&mas); 374 - MT_BUG_ON(mt, not_empty(mn)); 375 - mn->parent = ma_parent_ptr(mn); 376 - ma_free_rcu(mn); 377 - for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) { 378 - mn = mas_pop_node(&mas); 379 - MT_BUG_ON(mt, not_empty(mn)); 380 - mn->parent = ma_parent_ptr(mn); 381 - ma_free_rcu(mn); 382 - } 383 - MT_BUG_ON(mt, mas_allocated(&mas) != 0); 384 - 385 - 386 - for (i = 3; i < MAPLE_NODE_MASK * 3; i++) { 387 - mas.node = MA_ERROR(-ENOMEM); 388 - mas_node_count(&mas, i); /* Request */ 389 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 390 - mn = mas_pop_node(&mas); /* get the next node. 
*/ 391 - mas_push_node(&mas, mn); /* put it back */ 392 - mas_destroy(&mas); 393 - 394 - mas.node = MA_ERROR(-ENOMEM); 395 - mas_node_count(&mas, i); /* Request */ 396 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 397 - mn = mas_pop_node(&mas); /* get the next node. */ 398 - mn2 = mas_pop_node(&mas); /* get the next node. */ 399 - mas_push_node(&mas, mn); /* put them back */ 400 - mas_push_node(&mas, mn2); 401 - mas_destroy(&mas); 402 - 403 - mas.node = MA_ERROR(-ENOMEM); 404 - mas_node_count(&mas, i); /* Request */ 405 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 406 - mn = mas_pop_node(&mas); /* get the next node. */ 407 - mn2 = mas_pop_node(&mas); /* get the next node. */ 408 - mn3 = mas_pop_node(&mas); /* get the next node. */ 409 - mas_push_node(&mas, mn); /* put them back */ 410 - mas_push_node(&mas, mn2); 411 - mas_push_node(&mas, mn3); 412 - mas_destroy(&mas); 413 - 414 - mas.node = MA_ERROR(-ENOMEM); 415 - mas_node_count(&mas, i); /* Request */ 416 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 417 - mn = mas_pop_node(&mas); /* get the next node. */ 418 - mn->parent = ma_parent_ptr(mn); 419 - ma_free_rcu(mn); 420 - mas_destroy(&mas); 421 - 422 - mas.node = MA_ERROR(-ENOMEM); 423 - mas_node_count(&mas, i); /* Request */ 424 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 425 - mn = mas_pop_node(&mas); /* get the next node. */ 426 - mn->parent = ma_parent_ptr(mn); 427 - ma_free_rcu(mn); 428 - mn = mas_pop_node(&mas); /* get the next node. */ 429 - mn->parent = ma_parent_ptr(mn); 430 - ma_free_rcu(mn); 431 - mn = mas_pop_node(&mas); /* get the next node. */ 432 - mn->parent = ma_parent_ptr(mn); 433 - ma_free_rcu(mn); 434 - mas_destroy(&mas); 435 - } 436 - 437 - mas.node = MA_ERROR(-ENOMEM); 438 - mas_node_count(&mas, 5); /* Request */ 439 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 440 - MT_BUG_ON(mt, mas_allocated(&mas) != 5); 441 - mas.node = MA_ERROR(-ENOMEM); 442 - mas_node_count(&mas, 10); /* Request */ 443 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 444 - mas.status = ma_start; 445 - MT_BUG_ON(mt, mas_allocated(&mas) != 10); 446 - mas_destroy(&mas); 447 - 448 - mas.node = MA_ERROR(-ENOMEM); 449 - mas_node_count(&mas, MAPLE_ALLOC_SLOTS - 1); /* Request */ 450 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 451 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS - 1); 452 - mas.node = MA_ERROR(-ENOMEM); 453 - mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ 454 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 455 - mas.status = ma_start; 456 - MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); 457 - mas_destroy(&mas); 458 - 459 - mas.node = MA_ERROR(-ENOMEM); 460 - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ 461 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 462 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); 463 - mas.node = MA_ERROR(-ENOMEM); 464 - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */ 465 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 466 - mas.status = ma_start; 467 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2); 468 - mas_destroy(&mas); 469 - 470 - mas.node = MA_ERROR(-ENOMEM); 471 - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */ 472 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 473 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1); 474 - mas.node = MA_ERROR(-ENOMEM); 475 - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */ 476 - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ 477 - 
mas.status = ma_start; 478 - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2); 479 - mas_destroy(&mas); 480 - 481 - mtree_unlock(mt); 482 - } 483 65 484 66 /* 485 67 * Check erasing including RCU. ··· 35025 35455 MT_BUG_ON(mt, count != e); 35026 35456 mtree_destroy(mt); 35027 35457 35028 - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); 35029 - mas_reset(&mas); 35030 - mt_zero_nr_tallocated(); 35031 - mt_set_non_kernel(200); 35032 - mas_expected_entries(&mas, max); 35033 - for (count = 0; count <= max; count++) { 35034 - mas.index = mas.last = count; 35035 - mas_store(&mas, xa_mk_value(count)); 35036 - MT_BUG_ON(mt, mas_is_err(&mas)); 35037 - } 35038 - mas_destroy(&mas); 35039 35458 rcu_barrier(); 35040 35459 /* 35041 35460 * pr_info(" ->seq test of 0-%lu %luK in %d active (%d total)\n", ··· 35083 35524 return vacant_height; 35084 35525 } 35085 35526 35527 + static int mas_allocated(struct ma_state *mas) 35528 + { 35529 + int total = 0; 35530 + 35531 + if (mas->alloc) 35532 + total++; 35533 + 35534 + if (mas->sheaf) 35535 + total += kmem_cache_sheaf_size(mas->sheaf); 35536 + 35537 + return total; 35538 + } 35086 35539 /* Preallocation testing */ 35087 35540 static noinline void __init check_prealloc(struct maple_tree *mt) 35088 35541 { ··· 35113 35542 35114 35543 /* Spanning store */ 35115 35544 mas_set_range(&mas, 470, 500); 35116 - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); 35545 + 35546 + mas_wr_preallocate(&wr_mas, ptr); 35547 + MT_BUG_ON(mt, mas.store_type != wr_spanning_store); 35548 + MT_BUG_ON(mt, mas_is_err(&mas)); 35117 35549 allocated = mas_allocated(&mas); 35118 35550 height = mas_mt_height(&mas); 35119 35551 vacant_height = get_vacant_height(&wr_mas, ptr); ··· 35126 35552 allocated = mas_allocated(&mas); 35127 35553 MT_BUG_ON(mt, allocated != 0); 35128 35554 35555 + mas_wr_preallocate(&wr_mas, ptr); 35129 35556 MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); 35130 35557 allocated = mas_allocated(&mas); 35131 35558 height = mas_mt_height(&mas); ··· 35166 35591 MT_BUG_ON(mt, allocated != 0); 35167 35592 mn->parent = ma_parent_ptr(mn); 35168 35593 ma_free_rcu(mn); 35169 - 35170 - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); 35171 - allocated = mas_allocated(&mas); 35172 - height = mas_mt_height(&mas); 35173 - vacant_height = get_vacant_height(&wr_mas, ptr); 35174 - MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); 35175 - mn = mas_pop_node(&mas); 35176 - MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); 35177 - mas_push_node(&mas, mn); 35178 - MT_BUG_ON(mt, mas_allocated(&mas) != allocated); 35179 - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); 35180 - mas_destroy(&mas); 35181 - allocated = mas_allocated(&mas); 35182 - MT_BUG_ON(mt, allocated != 0); 35183 35594 35184 35595 MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); 35185 35596 allocated = mas_allocated(&mas); ··· 35967 36406 check_load(mt, 6, xa_mk_value(0xC)); 35968 36407 mtree_unlock(mt); 35969 36408 36409 + mt_set_non_kernel(0); 35970 36410 /* test for the same race but with mas_store_gfp() */ 35971 36411 mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); 35972 36412 mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); 35973 36413 35974 36414 mas_set_range(&mas, 0, 5); 36415 + 36416 + /* setup writer 2 that will trigger the race condition */ 36417 + mt_set_private(mt); 36418 + mt_set_callback(writer2); 36419 + 35975 36420 mtree_lock(mt); 35976 36421 mas_store_gfp(&mas, NULL, GFP_KERNEL); 35977 36422 ··· 36021 36454 return 0; 
36022 36455 } 36023 36456 36024 - /* 36025 - * test to check that bulk stores do not use wr_rebalance as the store 36026 - * type. 36027 - */ 36028 - static inline void check_bulk_rebalance(struct maple_tree *mt) 36029 - { 36030 - MA_STATE(mas, mt, ULONG_MAX, ULONG_MAX); 36031 - int max = 10; 36032 - 36033 - build_full_tree(mt, 0, 2); 36034 - 36035 - /* erase every entry in the tree */ 36036 - do { 36037 - /* set up bulk store mode */ 36038 - mas_expected_entries(&mas, max); 36039 - mas_erase(&mas); 36040 - MT_BUG_ON(mt, mas.store_type == wr_rebalance); 36041 - } while (mas_prev(&mas, 0) != NULL); 36042 - 36043 - mas_destroy(&mas); 36044 - } 36045 36457 36046 36458 void farmer_tests(void) 36047 36459 { ··· 36031 36485 36032 36486 mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU); 36033 36487 check_vma_modification(&tree); 36034 - mtree_destroy(&tree); 36035 - 36036 - mt_init(&tree); 36037 - check_bulk_rebalance(&tree); 36038 36488 mtree_destroy(&tree); 36039 36489 36040 36490 tree.ma_root = xa_mk_value(0); ··· 36090 36548 /* RCU testing */ 36091 36549 mt_init_flags(&tree, 0); 36092 36550 check_erase_testset(&tree); 36093 - mtree_destroy(&tree); 36094 - 36095 - mt_init_flags(&tree, 0); 36096 - check_new_node(&tree); 36097 36551 mtree_destroy(&tree); 36098 36552 36099 36553 if (!MAPLE_32BIT) {
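As a reading aid, a minimal sketch (not part of the diff) of the pattern the reworked preallocation tests above follow now that node requests are backed by a per-state sheaf: only totals from the local mas_allocated() helper are meaningful, since nodes sit in mas->sheaf rather than in a pop/push list. The function name and range below are made up; mas_wr_preallocate(), mas_allocated(), mas_destroy() and MT_BUG_ON() are the ones used in the test file, and a pre-populated tree whose store actually needs new nodes is assumed.

static void check_prealloc_sketch(struct maple_tree *mt, void *ptr)
{
	MA_STATE(mas, mt, 10, 20);
	MA_WR_STATE(wr_mas, &mas, ptr);
	int allocated;

	mtree_lock(mt);
	/* Fill mas->alloc and mas->sheaf for the pending write. */
	mas_wr_preallocate(&wr_mas, ptr);
	MT_BUG_ON(mt, mas_is_err(&mas));

	/* Only the total is observable; assumes the store needs new nodes. */
	allocated = mas_allocated(&mas);
	MT_BUG_ON(mt, allocated == 0);

	/* mas_destroy() returns unused nodes and the sheaf to the cache. */
	mas_destroy(&mas);
	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
	mtree_unlock(mt);
}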
+100 -20
tools/testing/shared/linux.c
··· 16 16 int preempt_count; 17 17 int test_verbose; 18 18 19 - struct kmem_cache { 20 - pthread_mutex_t lock; 21 - unsigned int size; 22 - unsigned int align; 23 - int nr_objs; 24 - void *objs; 25 - void (*ctor)(void *); 26 - unsigned int non_kernel; 27 - unsigned long nr_allocated; 28 - unsigned long nr_tallocated; 29 - bool exec_callback; 30 - void (*callback)(void *); 31 - void *private; 32 - }; 33 - 34 19 void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)) 35 20 { 36 21 cachep->callback = callback; ··· 64 79 65 80 if (!(gfp & __GFP_DIRECT_RECLAIM)) { 66 81 if (!cachep->non_kernel) { 67 - cachep->exec_callback = true; 82 + if (cachep->callback) 83 + cachep->exec_callback = true; 68 84 return NULL; 69 85 } 70 86 ··· 138 152 if (kmalloc_verbose) 139 153 pr_debug("Bulk free %p[0-%zu]\n", list, size - 1); 140 154 155 + if (cachep->exec_callback) { 156 + if (cachep->callback) 157 + cachep->callback(cachep->private); 158 + cachep->exec_callback = false; 159 + } 160 + 141 161 pthread_mutex_lock(&cachep->lock); 142 162 for (int i = 0; i < size; i++) 143 163 kmem_cache_free_locked(cachep, list[i]); ··· 211 219 for (i = 0; i < size; i++) 212 220 __kmem_cache_free_locked(cachep, p[i]); 213 221 pthread_mutex_unlock(&cachep->lock); 222 + if (cachep->callback) 223 + cachep->exec_callback = true; 214 224 return 0; 215 225 } 216 226 ··· 228 234 } 229 235 230 236 struct kmem_cache * 231 - kmem_cache_create(const char *name, unsigned int size, unsigned int align, 232 - unsigned int flags, void (*ctor)(void *)) 237 + __kmem_cache_create_args(const char *name, unsigned int size, 238 + struct kmem_cache_args *args, 239 + unsigned int flags) 233 240 { 234 241 struct kmem_cache *ret = malloc(sizeof(*ret)); 235 242 236 243 pthread_mutex_init(&ret->lock, NULL); 237 244 ret->size = size; 238 - ret->align = align; 245 + ret->align = args->align; 246 + ret->sheaf_capacity = args->sheaf_capacity; 239 247 ret->nr_objs = 0; 240 248 ret->nr_allocated = 0; 241 249 ret->nr_tallocated = 0; 242 250 ret->objs = NULL; 243 - ret->ctor = ctor; 251 + ret->ctor = args->ctor; 244 252 ret->non_kernel = 0; 245 253 ret->exec_callback = false; 246 254 ret->callback = NULL; 247 255 ret->private = NULL; 256 + 248 257 return ret; 258 + } 259 + 260 + struct slab_sheaf * 261 + kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) 262 + { 263 + struct slab_sheaf *sheaf; 264 + unsigned int capacity; 265 + 266 + if (s->exec_callback) { 267 + if (s->callback) 268 + s->callback(s->private); 269 + s->exec_callback = false; 270 + } 271 + 272 + capacity = max(size, s->sheaf_capacity); 273 + 274 + sheaf = calloc(1, sizeof(*sheaf) + sizeof(void *) * capacity); 275 + if (!sheaf) 276 + return NULL; 277 + 278 + sheaf->cache = s; 279 + sheaf->capacity = capacity; 280 + sheaf->size = kmem_cache_alloc_bulk(s, gfp, size, sheaf->objects); 281 + if (!sheaf->size) { 282 + free(sheaf); 283 + return NULL; 284 + } 285 + 286 + return sheaf; 287 + } 288 + 289 + int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, 290 + struct slab_sheaf **sheafp, unsigned int size) 291 + { 292 + struct slab_sheaf *sheaf = *sheafp; 293 + int refill; 294 + 295 + if (sheaf->size >= size) 296 + return 0; 297 + 298 + if (size > sheaf->capacity) { 299 + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); 300 + if (!sheaf) 301 + return -ENOMEM; 302 + 303 + kmem_cache_return_sheaf(s, gfp, *sheafp); 304 + *sheafp = sheaf; 305 + return 0; 306 + } 307 + 308 + refill = kmem_cache_alloc_bulk(s, gfp, size - sheaf->size, 309 + 
&sheaf->objects[sheaf->size]); 310 + if (!refill) 311 + return -ENOMEM; 312 + 313 + sheaf->size += refill; 314 + return 0; 315 + } 316 + 317 + void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, 318 + struct slab_sheaf *sheaf) 319 + { 320 + if (sheaf->size) 321 + kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); 322 + 323 + free(sheaf); 324 + } 325 + 326 + void * 327 + kmem_cache_alloc_from_sheaf(struct kmem_cache *s, gfp_t gfp, 328 + struct slab_sheaf *sheaf) 329 + { 330 + void *obj; 331 + 332 + if (sheaf->size == 0) { 333 + printf("Nothing left in sheaf!\n"); 334 + return NULL; 335 + } 336 + 337 + obj = sheaf->objects[--sheaf->size]; 338 + sheaf->objects[sheaf->size] = NULL; 339 + 340 + return obj; 249 341 } 250 342 251 343 /*
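A hedged usage sketch (not part of the diff) of the sheaf shim above, showing the prefill / alloc-from / return cycle it is meant to emulate for the maple tree tests. demo_cache, the object count of 8 and sheaf_demo() itself are placeholders; kmem_cache_prefill_sheaf(), kmem_cache_alloc_from_sheaf(), kmem_cache_return_sheaf() and kmem_cache_free_bulk() are the functions defined or used in this file.

static int sheaf_demo(struct kmem_cache *demo_cache)
{
	struct slab_sheaf *sheaf;
	void *objs[8];
	unsigned int i;

	/* Guarantee at least 8 objects up front (rounded up to sheaf_capacity). */
	sheaf = kmem_cache_prefill_sheaf(demo_cache, GFP_KERNEL, 8);
	if (!sheaf)
		return -ENOMEM;

	/* Each allocation simply pops an object off the prefilled array. */
	for (i = 0; i < 8; i++)
		objs[i] = kmem_cache_alloc_from_sheaf(demo_cache, GFP_KERNEL, sheaf);

	/* Hand any leftover objects back to the cache together with the sheaf. */
	kmem_cache_return_sheaf(demo_cache, GFP_KERNEL, sheaf);

	/* The consumed objects are freed normally when no longer needed. */
	kmem_cache_free_bulk(demo_cache, 8, objs);
	return 0;
}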
+11
tools/testing/shared/maple-shared.h
··· 10 10 #include <time.h> 11 11 #include "linux/init.h" 12 12 13 + void maple_rcu_cb(struct rcu_head *head); 14 + #define rcu_cb maple_rcu_cb 15 + 16 + #define kfree_rcu(_struct, _memb) \ 17 + do { \ 18 + typeof(_struct) _p_struct = (_struct); \ 19 + \ 20 + call_rcu(&((_p_struct)->_memb), rcu_cb); \ 21 + } while(0); 22 + 23 + 13 24 #endif /* __MAPLE_SHARED_H__ */
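For readability, an illustration (not part of the diff) of what the kfree_rcu() shim above resolves to for a maple node: free_node_deferred() is a hypothetical caller, while the rcu member and maple_rcu_cb() are what this header wires up.

static void free_node_deferred(struct maple_node *node)
{
	kfree_rcu(node, rcu);

	/*
	 * which, with the shim above, expands to roughly:
	 *
	 *	struct maple_node *_p_struct = (node);
	 *	call_rcu(&_p_struct->rcu, maple_rcu_cb);
	 */
}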
+7
tools/testing/shared/maple-shim.c
··· 3 3 /* Very simple shim around the maple tree. */ 4 4 5 5 #include "maple-shared.h" 6 + #include <linux/slab.h> 6 7 7 8 #include "../../../lib/maple_tree.c" 9 + 10 + void maple_rcu_cb(struct rcu_head *head) { 11 + struct maple_node *node = container_of(head, struct maple_node, rcu); 12 + 13 + kmem_cache_free(maple_node_cache, node); 14 + }
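Since the shim compiles lib/maple_tree.c directly (see the #include above), the tree's RCU free path can go through the shared kfree_rcu() shim, and maple_rcu_cb() returns the node to maple_node_cache once the callback runs. An approximate sketch of that caller, reconstructed rather than copied from this diff, so treat the exact check as an assumption:

static void ma_free_rcu(struct maple_node *node)
{
	/* the node must no longer look linked into a tree */
	WARN_ON(node->parent != ma_parent_ptr(node));
	kfree_rcu(node, rcu);	/* maple_rcu_cb() frees into maple_node_cache */
}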
+59 -200
tools/testing/vma/vma_internal.h
··· 26 26 #include <linux/mm.h> 27 27 #include <linux/rbtree.h> 28 28 #include <linux/refcount.h> 29 + #include <linux/slab.h> 29 30 30 31 extern unsigned long stack_guard_gap; 31 32 #ifdef CONFIG_MMU ··· 510 509 .len_in = len_, \ 511 510 } 512 511 513 - struct kmem_cache_args { 514 - /** 515 - * @align: The required alignment for the objects. 516 - * 517 - * %0 means no specific alignment is requested. 518 - */ 519 - unsigned int align; 520 - /** 521 - * @useroffset: Usercopy region offset. 522 - * 523 - * %0 is a valid offset, when @usersize is non-%0 524 - */ 525 - unsigned int useroffset; 526 - /** 527 - * @usersize: Usercopy region size. 528 - * 529 - * %0 means no usercopy region is specified. 530 - */ 531 - unsigned int usersize; 532 - /** 533 - * @freeptr_offset: Custom offset for the free pointer 534 - * in &SLAB_TYPESAFE_BY_RCU caches 535 - * 536 - * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer 537 - * outside of the object. This might cause the object to grow in size. 538 - * Cache creators that have a reason to avoid this can specify a custom 539 - * free pointer offset in their struct where the free pointer will be 540 - * placed. 541 - * 542 - * Note that placing the free pointer inside the object requires the 543 - * caller to ensure that no fields are invalidated that are required to 544 - * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for 545 - * details). 546 - * 547 - * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset 548 - * is specified, %use_freeptr_offset must be set %true. 549 - * 550 - * Note that @ctor currently isn't supported with custom free pointers 551 - * as a @ctor requires an external free pointer. 552 - */ 553 - unsigned int freeptr_offset; 554 - /** 555 - * @use_freeptr_offset: Whether a @freeptr_offset is used. 556 - */ 557 - bool use_freeptr_offset; 558 - /** 559 - * @ctor: A constructor for the objects. 560 - * 561 - * The constructor is invoked for each object in a newly allocated slab 562 - * page. It is the cache user's responsibility to free object in the 563 - * same state as after calling the constructor, or deal appropriately 564 - * with any differences between a freshly constructed and a reallocated 565 - * object. 566 - * 567 - * %NULL means no constructor. 568 - */ 569 - void (*ctor)(void *); 570 - }; 571 - 572 512 static inline void vma_iter_invalidate(struct vma_iterator *vmi) 573 513 { 574 514 mas_pause(&vmi->mas); ··· 592 650 vma->vm_ops = &vma_dummy_vm_ops; 593 651 INIT_LIST_HEAD(&vma->anon_vma_chain); 594 652 vma->vm_lock_seq = UINT_MAX; 595 - } 596 - 597 - struct kmem_cache { 598 - const char *name; 599 - size_t object_size; 600 - struct kmem_cache_args *args; 601 - }; 602 - 603 - static inline struct kmem_cache *__kmem_cache_create(const char *name, 604 - size_t object_size, 605 - struct kmem_cache_args *args) 606 - { 607 - struct kmem_cache *ret = malloc(sizeof(struct kmem_cache)); 608 - 609 - ret->name = name; 610 - ret->object_size = object_size; 611 - ret->args = args; 612 - 613 - return ret; 614 - } 615 - 616 - #define kmem_cache_create(__name, __object_size, __args, ...) 
\ 617 - __kmem_cache_create((__name), (__object_size), (__args)) 618 - 619 - static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 620 - { 621 - (void)gfpflags; 622 - 623 - return calloc(s->object_size, 1); 624 - } 625 - 626 - static inline void kmem_cache_free(struct kmem_cache *s, void *x) 627 - { 628 - free(x); 629 653 } 630 654 631 655 /* ··· 750 842 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 751 843 } 752 844 753 - static inline void fput(struct file *) 845 + static inline void fput(struct file *file) 754 846 { 755 847 } 756 848 757 - static inline void mpol_put(struct mempolicy *) 849 + static inline void mpol_put(struct mempolicy *pol) 758 850 { 759 851 } 760 852 ··· 762 854 { 763 855 } 764 856 765 - static inline void tlb_gather_mmu(struct mmu_gather *, struct mm_struct *) 857 + static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) 766 858 { 767 859 } 768 860 769 - static inline void update_hiwater_rss(struct mm_struct *) 861 + static inline void update_hiwater_rss(struct mm_struct *mm) 770 862 { 771 863 } 772 864 773 - static inline void update_hiwater_vm(struct mm_struct *) 865 + static inline void update_hiwater_vm(struct mm_struct *mm) 774 866 { 775 867 } 776 868 ··· 779 871 unsigned long end_addr, unsigned long tree_end, 780 872 bool mm_wr_locked) 781 873 { 782 - (void)tlb; 783 - (void)mas; 784 - (void)vma; 785 - (void)start_addr; 786 - (void)end_addr; 787 - (void)tree_end; 788 - (void)mm_wr_locked; 789 874 } 790 875 791 876 static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, 792 877 struct vm_area_struct *vma, unsigned long floor, 793 878 unsigned long ceiling, bool mm_wr_locked) 794 879 { 795 - (void)tlb; 796 - (void)mas; 797 - (void)vma; 798 - (void)floor; 799 - (void)ceiling; 800 - (void)mm_wr_locked; 801 880 } 802 881 803 - static inline void mapping_unmap_writable(struct address_space *) 882 + static inline void mapping_unmap_writable(struct address_space *mapping) 804 883 { 805 884 } 806 885 807 - static inline void flush_dcache_mmap_lock(struct address_space *) 886 + static inline void flush_dcache_mmap_lock(struct address_space *mapping) 808 887 { 809 888 } 810 889 811 - static inline void tlb_finish_mmu(struct mmu_gather *) 890 + static inline void tlb_finish_mmu(struct mmu_gather *tlb) 812 891 { 813 892 } 814 893 ··· 804 909 return f; 805 910 } 806 911 807 - static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *) 912 + static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 808 913 { 809 914 return 0; 810 915 } ··· 831 936 unsigned long end, 832 937 struct vm_area_struct *next) 833 938 { 834 - (void)vma; 835 - (void)start; 836 - (void)end; 837 - (void)next; 838 939 } 839 940 840 941 static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {} ··· 850 959 { 851 960 } 852 961 853 - static inline void vma_interval_tree_insert(struct vm_area_struct *, 854 - struct rb_root_cached *) 962 + static inline void vma_interval_tree_insert(struct vm_area_struct *vma, 963 + struct rb_root_cached *rb) 855 964 { 856 965 } 857 966 858 - static inline void vma_interval_tree_remove(struct vm_area_struct *, 859 - struct rb_root_cached *) 967 + static inline void vma_interval_tree_remove(struct vm_area_struct *vma, 968 + struct rb_root_cached *rb) 860 969 { 861 970 } 862 971 863 - static inline void flush_dcache_mmap_unlock(struct address_space *) 972 + static inline void flush_dcache_mmap_unlock(struct address_space *mapping) 864 
973 { 865 974 } 866 975 867 - static inline void anon_vma_interval_tree_insert(struct anon_vma_chain*, 868 - struct rb_root_cached *) 976 + static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc, 977 + struct rb_root_cached *rb) 869 978 { 870 979 } 871 980 872 - static inline void anon_vma_interval_tree_remove(struct anon_vma_chain*, 873 - struct rb_root_cached *) 981 + static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc, 982 + struct rb_root_cached *rb) 874 983 { 875 984 } 876 985 877 - static inline void uprobe_mmap(struct vm_area_struct *) 986 + static inline void uprobe_mmap(struct vm_area_struct *vma) 878 987 { 879 988 } 880 989 881 990 static inline void uprobe_munmap(struct vm_area_struct *vma, 882 991 unsigned long start, unsigned long end) 883 992 { 884 - (void)vma; 885 - (void)start; 886 - (void)end; 887 993 } 888 994 889 - static inline void i_mmap_lock_write(struct address_space *) 995 + static inline void i_mmap_lock_write(struct address_space *mapping) 890 996 { 891 997 } 892 998 893 - static inline void anon_vma_lock_write(struct anon_vma *) 999 + static inline void anon_vma_lock_write(struct anon_vma *anon_vma) 894 1000 { 895 1001 } 896 1002 897 - static inline void vma_assert_write_locked(struct vm_area_struct *) 1003 + static inline void vma_assert_write_locked(struct vm_area_struct *vma) 898 1004 { 899 1005 } 900 1006 ··· 901 1013 vma->anon_vma->was_unlinked = true; 902 1014 } 903 1015 904 - static inline void anon_vma_unlock_write(struct anon_vma *) 1016 + static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) 905 1017 { 906 1018 } 907 1019 908 - static inline void i_mmap_unlock_write(struct address_space *) 1020 + static inline void i_mmap_unlock_write(struct address_space *mapping) 909 1021 { 910 1022 } 911 1023 912 - static inline void anon_vma_merge(struct vm_area_struct *, 913 - struct vm_area_struct *) 1024 + static inline void anon_vma_merge(struct vm_area_struct *vma, 1025 + struct vm_area_struct *next) 914 1026 { 915 1027 } 916 1028 ··· 919 1031 unsigned long end, 920 1032 struct list_head *unmaps) 921 1033 { 922 - (void)vma; 923 - (void)start; 924 - (void)end; 925 - (void)unmaps; 926 - 927 1034 return 0; 928 1035 } 929 1036 930 - static inline void mmap_write_downgrade(struct mm_struct *) 1037 + static inline void mmap_write_downgrade(struct mm_struct *mm) 931 1038 { 932 1039 } 933 1040 934 - static inline void mmap_read_unlock(struct mm_struct *) 1041 + static inline void mmap_read_unlock(struct mm_struct *mm) 935 1042 { 936 1043 } 937 1044 938 - static inline void mmap_write_unlock(struct mm_struct *) 1045 + static inline void mmap_write_unlock(struct mm_struct *mm) 939 1046 { 940 1047 } 941 1048 942 - static inline int mmap_write_lock_killable(struct mm_struct *) 1049 + static inline int mmap_write_lock_killable(struct mm_struct *mm) 943 1050 { 944 1051 return 0; 945 1052 } ··· 943 1060 unsigned long start, 944 1061 unsigned long end) 945 1062 { 946 - (void)mm; 947 - (void)start; 948 - (void)end; 949 - 950 1063 return true; 951 1064 } 952 1065 ··· 950 1071 unsigned long start, 951 1072 unsigned long end) 952 1073 { 953 - (void)mm; 954 - (void)start; 955 - (void)end; 956 1074 } 957 1075 958 - static inline void mmap_assert_locked(struct mm_struct *) 1076 + static inline void mmap_assert_locked(struct mm_struct *mm) 959 1077 { 960 1078 } 961 1079 962 - static inline bool mpol_equal(struct mempolicy *, struct mempolicy *) 1080 + static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) 963 
1081 { 964 1082 return true; 965 1083 } ··· 964 1088 static inline void khugepaged_enter_vma(struct vm_area_struct *vma, 965 1089 vm_flags_t vm_flags) 966 1090 { 967 - (void)vma; 968 - (void)vm_flags; 969 1091 } 970 1092 971 - static inline bool mapping_can_writeback(struct address_space *) 1093 + static inline bool mapping_can_writeback(struct address_space *mapping) 972 1094 { 973 1095 return true; 974 1096 } 975 1097 976 - static inline bool is_vm_hugetlb_page(struct vm_area_struct *) 1098 + static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) 977 1099 { 978 1100 return false; 979 1101 } 980 1102 981 - static inline bool vma_soft_dirty_enabled(struct vm_area_struct *) 1103 + static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) 982 1104 { 983 1105 return false; 984 1106 } 985 1107 986 - static inline bool userfaultfd_wp(struct vm_area_struct *) 1108 + static inline bool userfaultfd_wp(struct vm_area_struct *vma) 987 1109 { 988 1110 return false; 989 1111 } 990 1112 991 - static inline void mmap_assert_write_locked(struct mm_struct *) 1113 + static inline void mmap_assert_write_locked(struct mm_struct *mm) 992 1114 { 993 1115 } 994 1116 995 - static inline void mutex_lock(struct mutex *) 1117 + static inline void mutex_lock(struct mutex *lock) 996 1118 { 997 1119 } 998 1120 999 - static inline void mutex_unlock(struct mutex *) 1121 + static inline void mutex_unlock(struct mutex *lock) 1000 1122 { 1001 1123 } 1002 1124 1003 - static inline bool mutex_is_locked(struct mutex *) 1125 + static inline bool mutex_is_locked(struct mutex *lock) 1004 1126 { 1005 1127 return true; 1006 1128 } 1007 1129 1008 - static inline bool signal_pending(void *) 1130 + static inline bool signal_pending(void *p) 1009 1131 { 1010 1132 return false; 1011 1133 } 1012 1134 1013 - static inline bool is_file_hugepages(struct file *) 1135 + static inline bool is_file_hugepages(struct file *file) 1014 1136 { 1015 1137 return false; 1016 1138 } 1017 1139 1018 - static inline int security_vm_enough_memory_mm(struct mm_struct *, long) 1140 + static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) 1019 1141 { 1020 1142 return 0; 1021 1143 } 1022 1144 1023 - static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long) 1145 + static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, 1146 + unsigned long npages) 1024 1147 { 1025 1148 return true; 1026 1149 } ··· 1044 1169 vma->__vm_flags &= ~flags; 1045 1170 } 1046 1171 1047 - static inline int shmem_zero_setup(struct vm_area_struct *) 1172 + static inline int shmem_zero_setup(struct vm_area_struct *vma) 1048 1173 { 1049 1174 return 0; 1050 1175 } ··· 1054 1179 vma->vm_ops = NULL; 1055 1180 } 1056 1181 1057 - static inline void ksm_add_vma(struct vm_area_struct *) 1182 + static inline void ksm_add_vma(struct vm_area_struct *vma) 1058 1183 { 1059 1184 } 1060 1185 1061 - static inline void perf_event_mmap(struct vm_area_struct *) 1186 + static inline void perf_event_mmap(struct vm_area_struct *vma) 1062 1187 { 1063 1188 } 1064 1189 1065 - static inline bool vma_is_dax(struct vm_area_struct *) 1190 + static inline bool vma_is_dax(struct vm_area_struct *vma) 1066 1191 { 1067 1192 return false; 1068 1193 } 1069 1194 1070 - static inline struct vm_area_struct *get_gate_vma(struct mm_struct *) 1195 + static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 1071 1196 { 1072 1197 return NULL; 1073 1198 } ··· 1092 1217 WRITE_ONCE(vma->vm_page_prot, vm_page_prot); 1093 1218 } 1094 1219 
1095 - static inline bool arch_validate_flags(vm_flags_t) 1220 + static inline bool arch_validate_flags(vm_flags_t flags) 1096 1221 { 1097 1222 return true; 1098 1223 } 1099 1224 1100 - static inline void vma_close(struct vm_area_struct *) 1225 + static inline void vma_close(struct vm_area_struct *vma) 1101 1226 { 1102 1227 } 1103 1228 1104 - static inline int mmap_file(struct file *, struct vm_area_struct *) 1229 + static inline int mmap_file(struct file *file, struct vm_area_struct *vma) 1105 1230 { 1106 1231 return 0; 1107 1232 } ··· 1263 1388 1264 1389 static inline unsigned long move_page_tables(struct pagetable_move_control *pmc) 1265 1390 { 1266 - (void)pmc; 1267 - 1268 1391 return 0; 1269 1392 } 1270 1393 ··· 1270 1397 unsigned long addr, unsigned long end, 1271 1398 unsigned long floor, unsigned long ceiling) 1272 1399 { 1273 - (void)tlb; 1274 - (void)addr; 1275 - (void)end; 1276 - (void)floor; 1277 - (void)ceiling; 1278 1400 } 1279 1401 1280 1402 static inline int ksm_execve(struct mm_struct *mm) 1281 1403 { 1282 - (void)mm; 1283 - 1284 1404 return 0; 1285 1405 } 1286 1406 1287 1407 static inline void ksm_exit(struct mm_struct *mm) 1288 1408 { 1289 - (void)mm; 1290 1409 } 1291 1410 1292 1411 static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) 1293 1412 { 1294 - (void)vma; 1295 - (void)reset_refcnt; 1413 + if (reset_refcnt) 1414 + refcount_set(&vma->vm_refcnt, 0); 1296 1415 } 1297 1416 1298 1417 static inline void vma_numab_state_init(struct vm_area_struct *vma) 1299 1418 { 1300 - (void)vma; 1301 1419 } 1302 1420 1303 1421 static inline void vma_numab_state_free(struct vm_area_struct *vma) 1304 1422 { 1305 - (void)vma; 1306 1423 } 1307 1424 1308 1425 static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, 1309 1426 struct vm_area_struct *new_vma) 1310 1427 { 1311 - (void)orig_vma; 1312 - (void)new_vma; 1313 1428 } 1314 1429 1315 1430 static inline void free_anon_vma_name(struct vm_area_struct *vma) 1316 1431 { 1317 - (void)vma; 1318 1432 } 1319 1433 1320 1434 /* Declared in vma.h. */ ··· 1355 1495 1356 1496 static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) 1357 1497 { 1358 - (void)vma; 1359 1498 } 1360 1499 1361 1500 static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) ··· 1365 1506 fput(file); 1366 1507 } 1367 1508 1368 - static inline bool shmem_file(struct file *) 1509 + static inline bool shmem_file(struct file *file) 1369 1510 { 1370 1511 return false; 1371 1512 } 1372 1513 1373 - static inline vm_flags_t ksm_vma_flags(const struct mm_struct *, const struct file *, 1374 - vm_flags_t vm_flags) 1514 + static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, 1515 + const struct file *file, vm_flags_t vm_flags) 1375 1516 { 1376 1517 return vm_flags; 1377 1518 }
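A brief sketch (not from the diff) of why vma_lock_init() above now honours reset_refcnt: a VMA copied wholesale from another one inherits a possibly non-zero vm_refcnt and must start over at zero, whereas a freshly set-up object should leave its refcount alone. demo_dup() and vm_area_cachep are hypothetical names used only for illustration; kmem_cache_alloc() and GFP_KERNEL come from the shared slab shim this header now includes.

static struct vm_area_struct *demo_dup(struct vm_area_struct *src)
{
	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

	if (!new)
		return NULL;

	/* copy everything, including whatever refcount the source carried... */
	memcpy(new, src, sizeof(*new));

	/* ...then drop the stale count so the duplicate starts unreferenced */
	vma_lock_init(new, true);
	return new;
}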