Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

slab: remove cpu (partial) slabs usage from allocation paths

We now rely on sheaves as the percpu caching layer and can refill them
directly from partial or newly allocated slabs. Start removing the cpu
(partial) slabs code, first from allocation paths.
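Roughly, the allocation path layering changes like this (a conceptual summary of this patch, not exact function names):

  before: percpu sheaves -> cpu slab -> cpu partial slabs -> node partial slabs -> new slab
  after:  percpu sheaves -> node partial slabs -> new slab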

This means that any allocation not satisfied from percpu sheaves will
end up in ___slab_alloc(), where we remove the usage of cpu (partial)
slabs, so it will only perform get_partial() or new_slab(). In the
latter case we reuse alloc_from_new_slab() (when we don't use
the debug/tiny alloc_single_from_new_slab() variant).
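For orientation, a condensed, illustrative sketch of the resulting slow path follows (the __GFP_THISNODE retry, the !allow_spin constraints and stat accounting of the real patch are omitted):

  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                             unsigned long addr, unsigned int orig_size)
  {
          struct partial_context pc = { .flags = gfpflags, .orig_size = orig_size };
          struct slab *slab;
          void *object;

          /* 1) try to take a single object from a node partial slab */
          object = get_from_partial(s, node, &pc);
          if (object)
                  goto success;

          /* 2) otherwise allocate a new slab and take an object from it */
          slab = new_slab(s, pc.flags, node);
          if (!slab)
                  return NULL;

          if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
                  object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
                  if (object)
                          goto success;
                  return NULL;
          }

          alloc_from_new_slab(s, slab, &object, 1, true);
          return object;

  success:
          if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
                  set_track(s, object, TRACK_ALLOC, addr, gfpflags);
          return object;
  }

get_from_partial() itself still falls back to get_from_any_partial() for other nodes before we resort to new_slab().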

In get_partial_node() we used to return a slab to be frozen as the cpu
slab, and further slabs to refill the percpu partial list. Now we only
want to return a single object and leave the slab on the list (unless it
became full). We can't simply reuse alloc_single_from_partial() as that
assumes freeing uses free_to_partial_list(). Instead we need to use
__slab_update_freelist() so the allocation works correctly against a
racing __slab_free().
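The core of the new get_from_partial_node() is therefore a freelist/counters
cmpxchg loop, slightly abridged here from the patch below:

          struct freelist_counters old, new;

          /*
           * Get a single object from the slab. This might race against
           * __slab_free(), which however has to take the list_lock if
           * it's about to make the slab fully free.
           */
          do {
                  old.freelist = slab->freelist;
                  old.counters = slab->counters;

                  new.freelist = get_freepointer(s, old.freelist);
                  new.counters = old.counters;
                  new.inuse++;
          } while (!__slab_update_freelist(s, slab, &old, &new,
                                           "get_from_partial_node"));

          object = old.freelist;
          /* the slab has no free objects left - take it off the partial list */
          if (!new.freelist)
                  remove_partial(n, slab);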

To reflect the new purpose of get_partial() functions, rename them to
get_from_partial(), get_from_partial_node(), and get_from_any_partial().

The rest of the changes removes functions that no longer have any
callers.

Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Hao Li <hao.li@linux.dev>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

+87 -541
mm/slub.c
··· 241 241 static DEFINE_STATIC_KEY_FALSE(strict_numa); 242 242 #endif 243 243 244 - /* Structure holding parameters for get_partial() call chain */ 244 + /* Structure holding parameters for get_from_partial() call chain */ 245 245 struct partial_context { 246 246 gfp_t flags; 247 247 unsigned int orig_size; 248 - void *object; 249 248 }; 250 249 251 250 /* Structure holding parameters for get_partial_node_bulk() */ ··· 603 604 return freelist_ptr_decode(s, p, ptr_addr); 604 605 } 605 606 606 - static void prefetch_freepointer(const struct kmem_cache *s, void *object) 607 - { 608 - prefetchw(object + s->offset); 609 - } 610 - 611 - /* 612 - * When running under KMSAN, get_freepointer_safe() may return an uninitialized 613 - * pointer value in the case the current thread loses the race for the next 614 - * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in 615 - * slab_alloc_node() will fail, so the uninitialized value won't be used, but 616 - * KMSAN will still check all arguments of cmpxchg because of imperfect 617 - * handling of inline assembly. 618 - * To work around this problem, we apply __no_kmsan_checks to ensure that 619 - * get_freepointer_safe() returns initialized memory. 620 - */ 621 - __no_kmsan_checks 622 - static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 623 - { 624 - unsigned long freepointer_addr; 625 - freeptr_t p; 626 - 627 - if (!debug_pagealloc_enabled_static()) 628 - return get_freepointer(s, object); 629 - 630 - object = kasan_reset_tag(object); 631 - freepointer_addr = (unsigned long)object + s->offset; 632 - copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); 633 - return freelist_ptr_decode(s, p, freepointer_addr); 634 - } 635 - 636 607 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 637 608 { 638 609 unsigned long freeptr_addr = (unsigned long)object + s->offset; ··· 682 713 nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); 683 714 s->cpu_partial_slabs = nr_slabs; 684 715 } 685 - 686 - static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 687 - { 688 - return s->cpu_partial_slabs; 689 - } 690 - #else 691 - #ifdef SLAB_SUPPORTS_SYSFS 716 + #elif defined(SLAB_SUPPORTS_SYSFS) 692 717 static inline void 693 718 slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) 694 719 { 695 - } 696 - #endif 697 - 698 - static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 699 - { 700 - return 0; 701 720 } 702 721 #endif /* CONFIG_SLUB_CPU_PARTIAL */ 703 722 ··· 1028 1071 p->handle = handle; 1029 1072 #endif 1030 1073 p->addr = addr; 1031 - p->cpu = smp_processor_id(); 1074 + p->cpu = raw_smp_processor_id(); 1032 1075 p->pid = current->pid; 1033 1076 p->when = jiffies; 1034 1077 } ··· 3536 3579 } 3537 3580 3538 3581 /* 3539 - * Try to allocate a partial slab from a specific node. 3582 + * Try to allocate object from a partial slab on a specific node. 3540 3583 */ 3541 - static struct slab *get_partial_node(struct kmem_cache *s, 3542 - struct kmem_cache_node *n, 3543 - struct partial_context *pc) 3584 + static void *get_from_partial_node(struct kmem_cache *s, 3585 + struct kmem_cache_node *n, 3586 + struct partial_context *pc) 3544 3587 { 3545 - struct slab *slab, *slab2, *partial = NULL; 3588 + struct slab *slab, *slab2; 3546 3589 unsigned long flags; 3547 - unsigned int partial_slabs = 0; 3590 + void *object = NULL; 3548 3591 3549 3592 /* 3550 3593 * Racy check. 
If we mistakenly see no partial slabs then we 3551 3594 * just allocate an empty slab. If we mistakenly try to get a 3552 - * partial slab and there is none available then get_partial() 3595 + * partial slab and there is none available then get_from_partial() 3553 3596 * will return NULL. 3554 3597 */ 3555 3598 if (!n || !n->nr_partial) ··· 3560 3603 else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3561 3604 return NULL; 3562 3605 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3606 + 3607 + struct freelist_counters old, new; 3608 + 3563 3609 if (!pfmemalloc_match(slab, pc->flags)) 3564 3610 continue; 3565 3611 3566 3612 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 3567 - void *object = alloc_single_from_partial(s, n, slab, 3613 + object = alloc_single_from_partial(s, n, slab, 3568 3614 pc->orig_size); 3569 - if (object) { 3570 - partial = slab; 3571 - pc->object = object; 3615 + if (object) 3572 3616 break; 3573 - } 3574 3617 continue; 3575 3618 } 3576 3619 3577 - remove_partial(n, slab); 3620 + /* 3621 + * get a single object from the slab. This might race against 3622 + * __slab_free(), which however has to take the list_lock if 3623 + * it's about to make the slab fully free. 3624 + */ 3625 + do { 3626 + old.freelist = slab->freelist; 3627 + old.counters = slab->counters; 3578 3628 3579 - if (!partial) { 3580 - partial = slab; 3581 - stat(s, ALLOC_FROM_PARTIAL); 3629 + new.freelist = get_freepointer(s, old.freelist); 3630 + new.counters = old.counters; 3631 + new.inuse++; 3582 3632 3583 - if ((slub_get_cpu_partial(s) == 0)) { 3584 - break; 3585 - } 3586 - } else { 3587 - put_cpu_partial(s, slab, 0); 3588 - stat(s, CPU_PARTIAL_NODE); 3633 + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); 3589 3634 3590 - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { 3591 - break; 3592 - } 3593 - } 3635 + object = old.freelist; 3636 + if (!new.freelist) 3637 + remove_partial(n, slab); 3638 + 3639 + break; 3594 3640 } 3595 3641 spin_unlock_irqrestore(&n->list_lock, flags); 3596 - return partial; 3642 + return object; 3597 3643 } 3598 3644 3599 3645 /* 3600 - * Get a slab from somewhere. Search in increasing NUMA distances. 3646 + * Get an object from somewhere. Search in increasing NUMA distances. 3601 3647 */ 3602 - static struct slab *get_any_partial(struct kmem_cache *s, 3603 - struct partial_context *pc) 3648 + static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) 3604 3649 { 3605 3650 #ifdef CONFIG_NUMA 3606 3651 struct zonelist *zonelist; 3607 3652 struct zoneref *z; 3608 3653 struct zone *zone; 3609 3654 enum zone_type highest_zoneidx = gfp_zone(pc->flags); 3610 - struct slab *slab; 3611 3655 unsigned int cpuset_mems_cookie; 3612 3656 3613 3657 /* ··· 3643 3685 3644 3686 if (n && cpuset_zone_allowed(zone, pc->flags) && 3645 3687 n->nr_partial > s->min_partial) { 3646 - slab = get_partial_node(s, n, pc); 3647 - if (slab) { 3688 + 3689 + void *object = get_from_partial_node(s, n, pc); 3690 + 3691 + if (object) { 3648 3692 /* 3649 3693 * Don't check read_mems_allowed_retry() 3650 3694 * here - if mems_allowed was updated in ··· 3654 3694 * between allocation and the cpuset 3655 3695 * update 3656 3696 */ 3657 - return slab; 3697 + return object; 3658 3698 } 3659 3699 } 3660 3700 } ··· 3664 3704 } 3665 3705 3666 3706 /* 3667 - * Get a partial slab, lock it and return it. 
3707 + * Get an object from a partial slab 3668 3708 */ 3669 - static struct slab *get_partial(struct kmem_cache *s, int node, 3670 - struct partial_context *pc) 3709 + static void *get_from_partial(struct kmem_cache *s, int node, 3710 + struct partial_context *pc) 3671 3711 { 3672 - struct slab *slab; 3673 3712 int searchnode = node; 3713 + void *object; 3674 3714 3675 3715 if (node == NUMA_NO_NODE) 3676 3716 searchnode = numa_mem_id(); 3677 3717 3678 - slab = get_partial_node(s, get_node(s, searchnode), pc); 3679 - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3680 - return slab; 3718 + object = get_from_partial_node(s, get_node(s, searchnode), pc); 3719 + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3720 + return object; 3681 3721 3682 - return get_any_partial(s, pc); 3722 + return get_from_any_partial(s, pc); 3683 3723 } 3684 3724 3685 3725 #ifdef CONFIG_PREEMPTION ··· 4237 4277 return 0; 4238 4278 } 4239 4279 4240 - /* 4241 - * Check if the objects in a per cpu structure fit numa 4242 - * locality expectations. 4243 - */ 4244 - static inline int node_match(struct slab *slab, int node) 4245 - { 4246 - #ifdef CONFIG_NUMA 4247 - if (node != NUMA_NO_NODE && slab_nid(slab) != node) 4248 - return 0; 4249 - #endif 4250 - return 1; 4251 - } 4252 - 4253 4280 #ifdef CONFIG_SLUB_DEBUG 4254 4281 static int count_free(struct slab *slab) 4255 4282 { ··· 4422 4475 } 4423 4476 4424 4477 /* 4425 - * Check the slab->freelist and either transfer the freelist to the 4426 - * per cpu freelist or deactivate the slab. 4427 - * 4428 - * The slab is still frozen if the return value is not NULL. 4429 - * 4430 - * If this function returns NULL then the slab has been unfrozen. 4431 - */ 4432 - static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 4433 - { 4434 - struct freelist_counters old, new; 4435 - 4436 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4437 - 4438 - do { 4439 - old.freelist = slab->freelist; 4440 - old.counters = slab->counters; 4441 - 4442 - new.freelist = NULL; 4443 - new.counters = old.counters; 4444 - 4445 - new.inuse = old.objects; 4446 - new.frozen = old.freelist != NULL; 4447 - 4448 - 4449 - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); 4450 - 4451 - return old.freelist; 4452 - } 4453 - 4454 - /* 4455 4478 * Get the slab's freelist and do not freeze it. 4456 4479 * 4457 4480 * Assumes the slab is isolated from node partial list and not frozen. ··· 4444 4527 new.inuse = old.objects; 4445 4528 4446 4529 } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); 4447 - 4448 - return old.freelist; 4449 - } 4450 - 4451 - /* 4452 - * Freeze the partial slab and return the pointer to the freelist. 4453 - */ 4454 - static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 4455 - { 4456 - struct freelist_counters old, new; 4457 - 4458 - do { 4459 - old.freelist = slab->freelist; 4460 - old.counters = slab->counters; 4461 - 4462 - new.freelist = NULL; 4463 - new.counters = old.counters; 4464 - VM_BUG_ON(new.frozen); 4465 - 4466 - new.inuse = old.objects; 4467 - new.frozen = 1; 4468 - 4469 - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); 4470 4530 4471 4531 return old.freelist; 4472 4532 } ··· 4515 4621 } 4516 4622 4517 4623 /* 4518 - * Slow path. The lockless freelist is empty or we need to perform 4519 - * debugging duties. 4624 + * Slow path. 
We failed to allocate via percpu sheaves or they are not available 4625 + * due to bootstrap or debugging enabled or SLUB_TINY. 4520 4626 * 4521 - * Processing is still very fast if new objects have been freed to the 4522 - * regular freelist. In that case we simply take over the regular freelist 4523 - * as the lockless freelist and zap the regular freelist. 4524 - * 4525 - * If that is not working then we fall back to the partial lists. We take the 4526 - * first element of the freelist as the object to allocate now and move the 4527 - * rest of the freelist to the lockless freelist. 4528 - * 4529 - * And if we were unable to get a new slab from the partial slab lists then 4530 - * we need to allocate a new slab. This is the slowest path since it involves 4531 - * a call to the page allocator and the setup of a new slab. 4532 - * 4533 - * Version of __slab_alloc to use when we know that preemption is 4534 - * already disabled (which is the case for bulk allocation). 4627 + * We try to allocate from partial slab lists and fall back to allocating a new 4628 + * slab. 4535 4629 */ 4536 4630 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4537 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4631 + unsigned long addr, unsigned int orig_size) 4538 4632 { 4539 4633 bool allow_spin = gfpflags_allow_spinning(gfpflags); 4540 - void *freelist; 4634 + void *object; 4541 4635 struct slab *slab; 4542 - unsigned long flags; 4543 4636 struct partial_context pc; 4544 4637 bool try_thisnode = true; 4545 4638 4546 4639 stat(s, ALLOC_SLOWPATH); 4547 - 4548 - reread_slab: 4549 - 4550 - slab = READ_ONCE(c->slab); 4551 - if (!slab) { 4552 - /* 4553 - * if the node is not online or has no normal memory, just 4554 - * ignore the node constraint 4555 - */ 4556 - if (unlikely(node != NUMA_NO_NODE && 4557 - !node_isset(node, slab_nodes))) 4558 - node = NUMA_NO_NODE; 4559 - goto new_slab; 4560 - } 4561 - 4562 - if (unlikely(!node_match(slab, node))) { 4563 - /* 4564 - * same as above but node_match() being false already 4565 - * implies node != NUMA_NO_NODE. 4566 - * 4567 - * We don't strictly honor pfmemalloc and NUMA preferences 4568 - * when !allow_spin because: 4569 - * 4570 - * 1. Most kmalloc() users allocate objects on the local node, 4571 - * so kmalloc_nolock() tries not to interfere with them by 4572 - * deactivating the cpu slab. 4573 - * 4574 - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause 4575 - * unnecessary slab allocations even when n->partial list 4576 - * is not empty. 
4577 - */ 4578 - if (!node_isset(node, slab_nodes) || 4579 - !allow_spin) { 4580 - node = NUMA_NO_NODE; 4581 - } else { 4582 - stat(s, ALLOC_NODE_MISMATCH); 4583 - goto deactivate_slab; 4584 - } 4585 - } 4586 - 4587 - /* 4588 - * By rights, we should be searching for a slab page that was 4589 - * PFMEMALLOC but right now, we are losing the pfmemalloc 4590 - * information when the page leaves the per-cpu allocator 4591 - */ 4592 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) 4593 - goto deactivate_slab; 4594 - 4595 - /* must check again c->slab in case we got preempted and it changed */ 4596 - local_lock_cpu_slab(s, flags); 4597 - 4598 - if (unlikely(slab != c->slab)) { 4599 - local_unlock_cpu_slab(s, flags); 4600 - goto reread_slab; 4601 - } 4602 - freelist = c->freelist; 4603 - if (freelist) 4604 - goto load_freelist; 4605 - 4606 - freelist = get_freelist(s, slab); 4607 - 4608 - if (!freelist) { 4609 - c->slab = NULL; 4610 - c->tid = next_tid(c->tid); 4611 - local_unlock_cpu_slab(s, flags); 4612 - stat(s, DEACTIVATE_BYPASS); 4613 - goto new_slab; 4614 - } 4615 - 4616 - stat(s, ALLOC_REFILL); 4617 - 4618 - load_freelist: 4619 - 4620 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4621 - 4622 - /* 4623 - * freelist is pointing to the list of objects to be used. 4624 - * slab is pointing to the slab from which the objects are obtained. 4625 - * That slab must be frozen for per cpu allocations to work. 4626 - */ 4627 - VM_BUG_ON(!c->slab->frozen); 4628 - c->freelist = get_freepointer(s, freelist); 4629 - c->tid = next_tid(c->tid); 4630 - local_unlock_cpu_slab(s, flags); 4631 - return freelist; 4632 - 4633 - deactivate_slab: 4634 - 4635 - local_lock_cpu_slab(s, flags); 4636 - if (slab != c->slab) { 4637 - local_unlock_cpu_slab(s, flags); 4638 - goto reread_slab; 4639 - } 4640 - freelist = c->freelist; 4641 - c->slab = NULL; 4642 - c->freelist = NULL; 4643 - c->tid = next_tid(c->tid); 4644 - local_unlock_cpu_slab(s, flags); 4645 - deactivate_slab(s, slab, freelist); 4646 - 4647 - new_slab: 4648 - 4649 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4650 - while (slub_percpu_partial(c)) { 4651 - local_lock_cpu_slab(s, flags); 4652 - if (unlikely(c->slab)) { 4653 - local_unlock_cpu_slab(s, flags); 4654 - goto reread_slab; 4655 - } 4656 - if (unlikely(!slub_percpu_partial(c))) { 4657 - local_unlock_cpu_slab(s, flags); 4658 - /* we were preempted and partial list got empty */ 4659 - goto new_objects; 4660 - } 4661 - 4662 - slab = slub_percpu_partial(c); 4663 - slub_set_percpu_partial(c, slab); 4664 - 4665 - if (likely(node_match(slab, node) && 4666 - pfmemalloc_match(slab, gfpflags)) || 4667 - !allow_spin) { 4668 - c->slab = slab; 4669 - freelist = get_freelist(s, slab); 4670 - VM_BUG_ON(!freelist); 4671 - stat(s, CPU_PARTIAL_ALLOC); 4672 - goto load_freelist; 4673 - } 4674 - 4675 - local_unlock_cpu_slab(s, flags); 4676 - 4677 - slab->next = NULL; 4678 - __put_partials(s, slab); 4679 - } 4680 - #endif 4681 4640 4682 4641 new_objects: 4683 4642 ··· 4539 4792 * When a preferred node is indicated but no __GFP_THISNODE 4540 4793 * 4541 4794 * 1) try to get a partial slab from target node only by having 4542 - * __GFP_THISNODE in pc.flags for get_partial() 4795 + * __GFP_THISNODE in pc.flags for get_from_partial() 4543 4796 * 2) if 1) failed, try to allocate a new slab from target node with 4544 4797 * GPF_NOWAIT | __GFP_THISNODE opportunistically 4545 4798 * 3) if 2) failed, retry with original gfpflags which will allow 4546 - * get_partial() try partial lists of other nodes before 
potentially 4547 - * allocating new page from other nodes 4799 + * get_from_partial() try partial lists of other nodes before 4800 + * potentially allocating new page from other nodes 4548 4801 */ 4549 4802 if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4550 4803 && try_thisnode)) { ··· 4556 4809 } 4557 4810 4558 4811 pc.orig_size = orig_size; 4559 - slab = get_partial(s, node, &pc); 4560 - if (slab) { 4561 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4562 - freelist = pc.object; 4563 - /* 4564 - * For debug caches here we had to go through 4565 - * alloc_single_from_partial() so just store the 4566 - * tracking info and return the object. 4567 - * 4568 - * Due to disabled preemption we need to disallow 4569 - * blocking. The flags are further adjusted by 4570 - * gfp_nested_mask() in stack_depot itself. 4571 - */ 4572 - if (s->flags & SLAB_STORE_USER) 4573 - set_track(s, freelist, TRACK_ALLOC, addr, 4574 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4812 + object = get_from_partial(s, node, &pc); 4813 + if (object) 4814 + goto success; 4575 4815 4576 - return freelist; 4577 - } 4578 - 4579 - freelist = freeze_slab(s, slab); 4580 - goto retry_load_slab; 4581 - } 4582 - 4583 - slub_put_cpu_ptr(s->cpu_slab); 4584 4816 slab = new_slab(s, pc.flags, node); 4585 - c = slub_get_cpu_ptr(s->cpu_slab); 4586 4817 4587 4818 if (unlikely(!slab)) { 4588 4819 if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) ··· 4575 4850 stat(s, ALLOC_SLAB); 4576 4851 4577 4852 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4578 - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4853 + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4579 4854 4580 - if (unlikely(!freelist)) { 4581 - /* This could cause an endless loop. Fail instead. */ 4582 - if (!allow_spin) 4583 - return NULL; 4584 - goto new_objects; 4585 - } 4855 + if (likely(object)) 4856 + goto success; 4857 + } else { 4858 + alloc_from_new_slab(s, slab, &object, 1, allow_spin); 4586 4859 4587 - if (s->flags & SLAB_STORE_USER) 4588 - set_track(s, freelist, TRACK_ALLOC, addr, 4589 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4590 - 4591 - return freelist; 4860 + /* we don't need to check SLAB_STORE_USER here */ 4861 + if (likely(object)) 4862 + return object; 4592 4863 } 4593 4864 4594 - /* 4595 - * No other reference to the slab yet so we can 4596 - * muck around with it freely without cmpxchg 4597 - */ 4598 - freelist = slab->freelist; 4599 - slab->freelist = NULL; 4600 - slab->inuse = slab->objects; 4601 - slab->frozen = 1; 4865 + if (allow_spin) 4866 + goto new_objects; 4602 4867 4603 - inc_slabs_node(s, slab_nid(slab), slab->objects); 4868 + /* This could cause an endless loop. Fail instead. */ 4869 + return NULL; 4604 4870 4605 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { 4606 - /* 4607 - * For !pfmemalloc_match() case we don't load freelist so that 4608 - * we don't make further mismatched allocations easier. 
4609 - */ 4610 - deactivate_slab(s, slab, get_freepointer(s, freelist)); 4611 - return freelist; 4612 - } 4871 + success: 4872 + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) 4873 + set_track(s, object, TRACK_ALLOC, addr, gfpflags); 4613 4874 4614 - retry_load_slab: 4615 - 4616 - local_lock_cpu_slab(s, flags); 4617 - if (unlikely(c->slab)) { 4618 - void *flush_freelist = c->freelist; 4619 - struct slab *flush_slab = c->slab; 4620 - 4621 - c->slab = NULL; 4622 - c->freelist = NULL; 4623 - c->tid = next_tid(c->tid); 4624 - 4625 - local_unlock_cpu_slab(s, flags); 4626 - 4627 - if (unlikely(!allow_spin)) { 4628 - /* Reentrant slub cannot take locks, defer */ 4629 - defer_deactivate_slab(flush_slab, flush_freelist); 4630 - } else { 4631 - deactivate_slab(s, flush_slab, flush_freelist); 4632 - } 4633 - 4634 - stat(s, CPUSLAB_FLUSH); 4635 - 4636 - goto retry_load_slab; 4637 - } 4638 - c->slab = slab; 4639 - 4640 - goto load_freelist; 4875 + return object; 4641 4876 } 4877 + 4642 4878 /* 4643 4879 * We disallow kprobes in ___slab_alloc() to prevent reentrance 4644 4880 * ··· 4614 4928 */ 4615 4929 NOKPROBE_SYMBOL(___slab_alloc); 4616 4930 4617 - /* 4618 - * A wrapper for ___slab_alloc() for contexts where preemption is not yet 4619 - * disabled. Compensates for possible cpu changes by refetching the per cpu area 4620 - * pointer. 4621 - */ 4622 - static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4623 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4624 - { 4625 - void *p; 4626 - 4627 - #ifdef CONFIG_PREEMPT_COUNT 4628 - /* 4629 - * We may have been preempted and rescheduled on a different 4630 - * cpu before disabling preemption. Need to reload cpu area 4631 - * pointer. 4632 - */ 4633 - c = slub_get_cpu_ptr(s->cpu_slab); 4634 - #endif 4635 - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { 4636 - if (local_lock_is_locked(&s->cpu_slab->lock)) { 4637 - /* 4638 - * EBUSY is an internal signal to kmalloc_nolock() to 4639 - * retry a different bucket. It's not propagated 4640 - * to the caller. 4641 - */ 4642 - p = ERR_PTR(-EBUSY); 4643 - goto out; 4644 - } 4645 - } 4646 - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); 4647 - out: 4648 - #ifdef CONFIG_PREEMPT_COUNT 4649 - slub_put_cpu_ptr(s->cpu_slab); 4650 - #endif 4651 - return p; 4652 - } 4653 - 4654 4931 static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 4655 4932 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 4656 4933 { 4657 - struct kmem_cache_cpu *c; 4658 - struct slab *slab; 4659 - unsigned long tid; 4660 4934 void *object; 4661 - 4662 - redo: 4663 - /* 4664 - * Must read kmem_cache cpu data via this cpu ptr. Preemption is 4665 - * enabled. We may switch back and forth between cpus while 4666 - * reading from one cpu area. That does not matter as long 4667 - * as we end up on the original cpu again when doing the cmpxchg. 4668 - * 4669 - * We must guarantee that tid and kmem_cache_cpu are retrieved on the 4670 - * same cpu. We read first the kmem_cache_cpu pointer and use it to read 4671 - * the tid. If we are preempted and switched to another cpu between the 4672 - * two reads, it's OK as the two are still associated with the same cpu 4673 - * and cmpxchg later will validate the cpu. 4674 - */ 4675 - c = raw_cpu_ptr(s->cpu_slab); 4676 - tid = READ_ONCE(c->tid); 4677 - 4678 - /* 4679 - * Irqless object alloc/free algorithm used here depends on sequence 4680 - * of fetching cpu_slab's data. 
tid should be fetched before anything 4681 - * on c to guarantee that object and slab associated with previous tid 4682 - * won't be used with current tid. If we fetch tid first, object and 4683 - * slab could be one associated with next tid and our alloc/free 4684 - * request will be failed. In this case, we will retry. So, no problem. 4685 - */ 4686 - barrier(); 4687 - 4688 - /* 4689 - * The transaction ids are globally unique per cpu and per operation on 4690 - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double 4691 - * occurs on the right processor and that there was no operation on the 4692 - * linked list in between. 4693 - */ 4694 - 4695 - object = c->freelist; 4696 - slab = c->slab; 4697 4935 4698 4936 #ifdef CONFIG_NUMA 4699 4937 if (static_branch_unlikely(&strict_numa) && ··· 4627 5017 4628 5018 if (mpol) { 4629 5019 /* 4630 - * Special BIND rule support. If existing slab 5020 + * Special BIND rule support. If the local node 4631 5021 * is in permitted set then do not redirect 4632 5022 * to a particular node. 4633 5023 * Otherwise we apply the memory policy to get 4634 5024 * the node we need to allocate on. 4635 5025 */ 4636 - if (mpol->mode != MPOL_BIND || !slab || 4637 - !node_isset(slab_nid(slab), mpol->nodes)) 4638 - 5026 + if (mpol->mode != MPOL_BIND || 5027 + !node_isset(numa_mem_id(), mpol->nodes)) 4639 5028 node = mempolicy_slab_node(); 4640 5029 } 4641 5030 } 4642 5031 #endif 4643 5032 4644 - if (!USE_LOCKLESS_FAST_PATH() || 4645 - unlikely(!object || !slab || !node_match(slab, node))) { 4646 - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); 4647 - } else { 4648 - void *next_object = get_freepointer_safe(s, object); 4649 - 4650 - /* 4651 - * The cmpxchg will only match if there was no additional 4652 - * operation and if we are on the right processor. 4653 - * 4654 - * The cmpxchg does the following atomically (without lock 4655 - * semantics!) 4656 - * 1. Relocate first pointer to the current per cpu area. 4657 - * 2. Verify that tid and freelist have not been changed 4658 - * 3. If they were not changed replace tid and freelist 4659 - * 4660 - * Since this is without lock semantics the protection is only 4661 - * against code executing on this cpu *not* from access by 4662 - * other cpus. 4663 - */ 4664 - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { 4665 - note_cmpxchg_failure("slab_alloc", s, tid); 4666 - goto redo; 4667 - } 4668 - prefetch_freepointer(s, next_object); 4669 - stat(s, ALLOC_FASTPATH); 4670 - } 5033 + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); 4671 5034 4672 5035 return object; 4673 5036 } ··· 7335 7752 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 7336 7753 void **p) 7337 7754 { 7338 - struct kmem_cache_cpu *c; 7339 - unsigned long irqflags; 7340 7755 int i; 7341 7756 7342 7757 /* 7343 - * Drain objects in the per cpu slab, while disabling local 7344 - * IRQs, which protects against PREEMPT and interrupts 7345 - * handlers invoking normal fastpath. 
7758 + * TODO: this might be more efficient (if necessary) by reusing 7759 + * __refill_objects() 7346 7760 */ 7347 - c = slub_get_cpu_ptr(s->cpu_slab); 7348 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7349 - 7350 7761 for (i = 0; i < size; i++) { 7351 - void *object = c->freelist; 7352 7762 7353 - if (unlikely(!object)) { 7354 - /* 7355 - * We may have removed an object from c->freelist using 7356 - * the fastpath in the previous iteration; in that case, 7357 - * c->tid has not been bumped yet. 7358 - * Since ___slab_alloc() may reenable interrupts while 7359 - * allocating memory, we should bump c->tid now. 7360 - */ 7361 - c->tid = next_tid(c->tid); 7763 + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, 7764 + s->object_size); 7765 + if (unlikely(!p[i])) 7766 + goto error; 7362 7767 7363 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 7364 - 7365 - /* 7366 - * Invoking slow path likely have side-effect 7367 - * of re-populating per CPU c->freelist 7368 - */ 7369 - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, 7370 - _RET_IP_, c, s->object_size); 7371 - if (unlikely(!p[i])) 7372 - goto error; 7373 - 7374 - c = this_cpu_ptr(s->cpu_slab); 7375 - maybe_wipe_obj_freeptr(s, p[i]); 7376 - 7377 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7378 - 7379 - continue; /* goto for-loop */ 7380 - } 7381 - c->freelist = get_freepointer(s, object); 7382 - p[i] = object; 7383 7768 maybe_wipe_obj_freeptr(s, p[i]); 7384 - stat(s, ALLOC_FASTPATH); 7385 7769 } 7386 - c->tid = next_tid(c->tid); 7387 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 7388 - slub_put_cpu_ptr(s->cpu_slab); 7389 7770 7390 7771 return i; 7391 7772 7392 7773 error: 7393 - slub_put_cpu_ptr(s->cpu_slab); 7394 7774 __kmem_cache_free_bulk(s, i, p); 7395 7775 return 0; 7396 7776