fork: define a local GFP_VMAP_STACK

The current allocation of VMAP stack memory uses (THREADINFO_GFP &
~__GFP_ACCOUNT), which is a complicated way of saying (GFP_KERNEL |
__GFP_ZERO):

<linux/thread_info.h>:
#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
<linux/gfp_types.h>:
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)

This is an unfortunate side-effect of independent changes blurring the
picture:

commit 19809c2da28aee5860ad9a2eff760730a0710df0 changed (THREADINFO_GFP |
__GFP_HIGHMEM) to just THREADINFO_GFP since highmem became implicit.

commit 9b6f7e163cd0f468d1b9696b785659d3c27c8667 then added stack caching
and rewrote the allocation to (THREADINFO_GFP & ~__GFP_ACCOUNT), as cached
stacks need to be accounted separately. However, that code, when it
eventually accounts the memory, does this:

ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0)

so the memory is charged as a GFP_KERNEL allocation.

Define a local GFP_VMAP_STACK as (GFP_KERNEL | __GFP_ZERO), use it for
the VMAP stack allocation, and move the comment there.

Link: https://lkml.kernel.org/r/20250509-gfp-stack-v1-1-82f6f7efc210@linaro.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Reported-by: Mateusz Guzik <mjguzik@gmail.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Linus Walleij and committed by

Andrew Morton 1 year ago 8e02b1b7 d82893c5

+45 -43

1 changed file

expand all

kernel

fork.c

+45 -43

kernel/fork.c

··· 185 185 kmem_cache_free(task_struct_cachep, tsk); 186 186 } 187 187 188 - #ifdef CONFIG_VMAP_STACK 188 + /* 189 + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 190 + * kmemcache based allocator. 191 + */ 192 + # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) 193 + 194 + # ifdef CONFIG_VMAP_STACK 189 195 /* 190 196 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB 191 197 * flush. Try to minimize the number of calls by caching stacks. ··· 204 198 struct vm_struct *stack_vm_area; 205 199 }; 206 200 207 - static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) 201 + static bool try_release_thread_stack_to_cache(struct vm_struct *vm) 208 202 { 209 203 unsigned int i; 210 204 211 205 for (i = 0; i < NR_CACHED_STACKS; i++) { 212 206 struct vm_struct *tmp = NULL; 213 207 214 - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) 208 + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) 215 209 return true; 216 210 } 217 211 return false; ··· 220 214 static void thread_stack_free_rcu(struct rcu_head *rh) 221 215 { 222 216 struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); 223 - struct vm_struct *vm_area = vm_stack->stack_vm_area; 224 217 225 218 if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) 226 219 return; 227 220 228 - vfree(vm_area->addr); 221 + vfree(vm_stack); 229 222 } 230 223 231 224 static void thread_stack_delayed_free(struct task_struct *tsk) ··· 237 232 238 233 static int free_vm_stack_cache(unsigned int cpu) 239 234 { 240 - struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); 235 + struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); 241 236 int i; 242 237 243 238 for (i = 0; i < NR_CACHED_STACKS; i++) { 244 - struct vm_struct *vm_area = cached_vm_stack_areas[i]; 239 + struct vm_struct *vm_stack = cached_vm_stacks[i]; 245 240 246 - if (!vm_area) 241 + if (!vm_stack) 247 242 continue; 248 243 249 - 
vfree(vm_area->addr); 250 - cached_vm_stack_areas[i] = NULL; 244 + vfree(vm_stack->addr); 245 + cached_vm_stacks[i] = NULL; 251 246 } 252 247 253 248 return 0; 254 249 } 255 250 256 - static int memcg_charge_kernel_stack(struct vm_struct *vm_area) 251 + static int memcg_charge_kernel_stack(struct vm_struct *vm) 257 252 { 258 253 int i; 259 254 int ret; 260 255 int nr_charged = 0; 261 256 262 - BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); 257 + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); 263 258 264 259 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { 265 - ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); 260 + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); 266 261 if (ret) 267 262 goto err; 268 263 nr_charged++; ··· 270 265 return 0; 271 266 err: 272 267 for (i = 0; i < nr_charged; i++) 273 - memcg_kmem_uncharge_page(vm_area->pages[i], 0); 268 + memcg_kmem_uncharge_page(vm->pages[i], 0); 274 269 return ret; 275 270 } 276 271 277 272 static int alloc_thread_stack_node(struct task_struct *tsk, int node) 278 273 { 279 - struct vm_struct *vm_area; 274 + struct vm_struct *vm; 280 275 void *stack; 281 276 int i; 282 277 283 278 for (i = 0; i < NR_CACHED_STACKS; i++) { 284 - vm_area = this_cpu_xchg(cached_stacks[i], NULL); 285 - if (!vm_area) 279 + struct vm_struct *s; 280 + 281 + s = this_cpu_xchg(cached_stacks[i], NULL); 282 + 283 + if (!s) 286 284 continue; 287 285 288 - if (memcg_charge_kernel_stack(vm_area)) { 289 - vfree(vm_area->addr); 290 - return -ENOMEM; 291 - } 292 - 293 286 /* Reset stack metadata. */ 294 - kasan_unpoison_range(vm_area->addr, THREAD_SIZE); 287 + kasan_unpoison_range(s->addr, THREAD_SIZE); 295 288 296 - stack = kasan_reset_tag(vm_area->addr); 289 + stack = kasan_reset_tag(s->addr); 297 290 298 291 /* Clear stale pointers from reused stack. 
*/ 299 292 memset(stack, 0, THREAD_SIZE); 300 293 301 - tsk->stack_vm_area = vm_area; 294 + if (memcg_charge_kernel_stack(s)) { 295 + vfree(s->addr); 296 + return -ENOMEM; 297 + } 298 + 299 + tsk->stack_vm_area = s; 302 300 tsk->stack = stack; 303 301 return 0; 304 302 } ··· 317 309 if (!stack) 318 310 return -ENOMEM; 319 311 320 - vm_area = find_vm_area(stack); 321 - if (memcg_charge_kernel_stack(vm_area)) { 312 + vm = find_vm_area(stack); 313 + if (memcg_charge_kernel_stack(vm)) { 322 314 vfree(stack); 323 315 return -ENOMEM; 324 316 } ··· 327 319 * free_thread_stack() can be called in interrupt context, 328 320 * so cache the vm_struct. 329 321 */ 330 - tsk->stack_vm_area = vm_area; 322 + tsk->stack_vm_area = vm; 331 323 stack = kasan_reset_tag(stack); 332 324 tsk->stack = stack; 333 325 return 0; ··· 342 334 tsk->stack_vm_area = NULL; 343 335 } 344 336 345 - #else /* !CONFIG_VMAP_STACK */ 346 - 347 - /* 348 - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 349 - * kmemcache based allocator. 
350 - */ 351 - #if THREAD_SIZE >= PAGE_SIZE 337 + # else /* !CONFIG_VMAP_STACK */ 352 338 353 339 static void thread_stack_free_rcu(struct rcu_head *rh) 354 340 { ··· 374 372 tsk->stack = NULL; 375 373 } 376 374 377 - #else /* !(THREAD_SIZE >= PAGE_SIZE) */ 375 + # endif /* CONFIG_VMAP_STACK */ 376 + # else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ 378 377 379 378 static struct kmem_cache *thread_stack_cache; 380 379 ··· 414 411 BUG_ON(thread_stack_cache == NULL); 415 412 } 416 413 417 - #endif /* THREAD_SIZE >= PAGE_SIZE */ 418 - #endif /* CONFIG_VMAP_STACK */ 414 + # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ 419 415 420 416 /* SLAB cache for signal_struct structures (tsk->signal) */ 421 417 static struct kmem_cache *signal_cachep; ··· 517 515 static void account_kernel_stack(struct task_struct *tsk, int account) 518 516 { 519 517 if (IS_ENABLED(CONFIG_VMAP_STACK)) { 520 - struct vm_struct *vm_area = task_stack_vm_area(tsk); 518 + struct vm_struct *vm = task_stack_vm_area(tsk); 521 519 int i; 522 520 523 521 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) 524 - mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, 522 + mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, 525 523 account * (PAGE_SIZE / 1024)); 526 524 } else { 527 525 void *stack = task_stack_page(tsk); ··· 537 535 account_kernel_stack(tsk, -1); 538 536 539 537 if (IS_ENABLED(CONFIG_VMAP_STACK)) { 540 - struct vm_struct *vm_area; 538 + struct vm_struct *vm; 541 539 int i; 542 540 543 - vm_area = task_stack_vm_area(tsk); 541 + vm = task_stack_vm_area(tsk); 544 542 for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) 545 - memcg_kmem_uncharge_page(vm_area->pages[i], 0); 543 + memcg_kmem_uncharge_page(vm->pages[i], 0); 546 544 } 547 545 } 548 546

Configure Feed

Configure Feed