Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'slab/next' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux

Pull slab changes from Pekka Enberg:
"The biggest change is byte-sized freelist indices which reduces slab
freelist memory usage:

https://lkml.org/lkml/2013/12/2/64"

* 'slab/next' of git://git.kernel.org/pub/scm/linux/kernel/git/penberg/linux:
mm: slab/slub: use page->list consistently instead of page->lru
mm/slab.c: cleanup outdated comments and unify variables naming
slab: fix wrongly used macro
slub: fix high order page allocation problem with __GFP_NOFAIL
slab: Make allocations with GFP_ZERO slightly more efficient
slab: make more slab management structure off the slab
slab: introduce byte sized index for the freelist of a slab
slab: restrict the number of objects in a slab
slab: introduce helper functions to get/set free object
slab: factor out calculate nr objects in cache_estimate

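A rough sense of what the byte-sized freelist index buys: each SLAB page keeps an array of free-object indices, and shrinking each entry from an unsigned int to a single byte cuts that per-slab metadata to a quarter whenever a slab holds no more than 256 objects. The userspace sketch below only mirrors the arithmetic; the 4 KiB slab and 32-byte object size are made-up illustrative values, it ignores that an on-slab freelist competes with objects for space, and it is not kernel code.

#include <stdio.h>

int main(void)
{
	/* Hypothetical slab layout, for illustration only. */
	size_t slab_size = 4096;	/* one 4 KiB page per slab */
	size_t object_size = 32;	/* made-up object size */
	size_t nr_objs = slab_size / object_size;	/* 128 objects */

	/* Per-slab freelist metadata before and after the series. */
	printf("objects per slab:                %zu\n", nr_objs);
	printf("freelist, unsigned int indices:  %zu bytes\n",
	       nr_objs * sizeof(unsigned int));
	printf("freelist, byte-sized indices:    %zu bytes\n",
	       nr_objs * sizeof(unsigned char));
	return 0;
}
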
5 files changed: +128 -84

include/linux/mm_types.h  (+2 -1)

···
 	union {
 		struct list_head lru;	/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
+					 * Can be used as a generic list
+					 * by the page owner.
 					 */
 		struct {	/* slub per cpu partial pages */
 			struct page *next;	/* Next partial slab */
···
 #endif
 	};

-	struct list_head list;		/* slobs list of pages */
 	struct slab *slab_page;	/* slab fields */
 	struct rcu_head rcu_head;	/* Used by SLAB
 					 * when destroying via RCU
include/linux/slab.h  (+11 -0)

···
 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
 #endif

+/*
+ * This restriction comes from byte sized index implementation.
+ * Page size is normally 2^12 bytes and, in this case, if we want to use
+ * byte sized index which can represent 2^8 entries, the size of the object
+ * should be equal or greater to 2^12 / 2^8 = 2^4 = 16.
+ * If minimum size of kmalloc is less than 16, we use it as minimum object
+ * size and give up to use byte sized index.
+ */
+#define SLAB_OBJ_MIN_SIZE	(KMALLOC_MIN_SIZE < 16 ? \
+				(KMALLOC_MIN_SIZE) : 16)
+
 #ifndef CONFIG_SLOB
 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
 #ifdef CONFIG_ZONE_DMA
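To see why 16 bytes is the floor in the SLAB_OBJ_MIN_SIZE definition above: with the usual 4 KiB page, a one-byte index can name at most 2^8 = 256 objects, and 4096 / 256 = 16, so any smaller object could overflow the index. The snippet below is a standalone sanity check of that arithmetic; PAGE_SIZE and KMALLOC_MIN_SIZE here are local stand-ins for one common configuration, not the kernel's macros.

#include <assert.h>
#include <stdio.h>

/* Local stand-ins for a common configuration: 4 KiB pages and a 16-byte
 * minimum kmalloc size. They shadow kernel names but are plain userspace
 * constants here. */
#define PAGE_SIZE		4096
#define KMALLOC_MIN_SIZE	16

/* Same shape as the SLAB_OBJ_MIN_SIZE definition in the hunk above. */
#define SLAB_OBJ_MIN_SIZE	(KMALLOC_MIN_SIZE < 16 ? (KMALLOC_MIN_SIZE) : 16)

int main(void)
{
	/* 4096 / 16 = 256 objects at most, exactly the range of one byte. */
	static_assert(PAGE_SIZE / SLAB_OBJ_MIN_SIZE <= 256,
		      "a byte index can address every object in a slab");
	printf("min object size %d -> at most %d objects per 4 KiB slab\n",
	       SLAB_OBJ_MIN_SIZE, PAGE_SIZE / SLAB_OBJ_MIN_SIZE);
	return 0;
}
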
mm/slab.c  (+107 -76)

···
 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 #endif

+#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
+				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
+
+#if FREELIST_BYTE_INDEX
+typedef unsigned char freelist_idx_t;
+#else
+typedef unsigned short freelist_idx_t;
+#endif
+
+#define SLAB_OBJ_MAX_NUM (1 << sizeof(freelist_idx_t) * BITS_PER_BYTE)
+
 /*
  * true if a page was allocated from pfmemalloc reserves for network-based
  * swap
···
  * OTOH the cpuarrays can contain lots of objects,
  * which could lock up otherwise freeable slabs.
  */
-#define REAPTIMEOUT_CPUC	(2*HZ)
-#define REAPTIMEOUT_LIST3	(4*HZ)
+#define REAPTIMEOUT_AC		(2*HZ)
+#define REAPTIMEOUT_NODE	(4*HZ)

 #if STATS
 #define STATS_INC_ACTIVE(x)	((x)->num_active++)
···
 	return cachep->array[smp_processor_id()];
 }

-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
+				size_t idx_size, size_t align)
 {
-	return ALIGN(nr_objs * sizeof(unsigned int), align);
+	int nr_objs;
+	size_t freelist_size;
+
+	/*
+	 * Ignore padding for the initial guess. The padding
+	 * is at most @align-1 bytes, and @buffer_size is at
+	 * least @align. In the worst case, this result will
+	 * be one greater than the number of objects that fit
+	 * into the memory allocation when taking the padding
+	 * into account.
+	 */
+	nr_objs = slab_size / (buffer_size + idx_size);
+
+	/*
+	 * This calculated number will be either the right
+	 * amount, or one greater than what we want.
+	 */
+	freelist_size = slab_size - nr_objs * buffer_size;
+	if (freelist_size < ALIGN(nr_objs * idx_size, align))
+		nr_objs--;
+
+	return nr_objs;
 }

 /*
···
 		nr_objs = slab_size / buffer_size;

 	} else {
-		/*
-		 * Ignore padding for the initial guess. The padding
-		 * is at most @align-1 bytes, and @buffer_size is at
-		 * least @align. In the worst case, this result will
-		 * be one greater than the number of objects that fit
-		 * into the memory allocation when taking the padding
-		 * into account.
-		 */
-		nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
-
-		/*
-		 * This calculated number will be either the right
-		 * amount, or one greater than what we want.
-		 */
-		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
-		       > slab_size)
-			nr_objs--;
-
-		mgmt_size = slab_mgmt_size(nr_objs, align);
+		nr_objs = calculate_nr_objs(slab_size, buffer_size,
+					sizeof(freelist_idx_t), align);
+		mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
 	}
 	*num = nr_objs;
 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
···

 	list_for_each_entry(cachep, &slab_caches, list) {
 		/*
-		 * Set up the size64 kmemlist for cpu before we can
+		 * Set up the kmem_cache_node for cpu before we can
 		 * begin anything. Make sure some other cpu on this
 		 * node has not already allocated this
 		 */
···
 			if (!n)
 				return -ENOMEM;
 			kmem_cache_node_init(n);
-			n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+			n->next_reap = jiffies + REAPTIMEOUT_NODE +
+			    ((unsigned long)cachep) % REAPTIMEOUT_NODE;

 			/*
-			 * The l3s don't come and go as CPUs come and
-			 * go. slab_mutex is sufficient
+			 * The kmem_cache_nodes don't come and go as CPUs
+			 * come and go. slab_mutex is sufficient
 			 * protection here.
 			 */
 			cachep->node[node] = n;
···
 	for_each_online_node(node) {
 		cachep->node[node] = &init_kmem_cache_node[index + node];
 		cachep->node[node]->next_reap = jiffies +
-		    REAPTIMEOUT_LIST3 +
-		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+		    REAPTIMEOUT_NODE +
+		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
 	}
 }
···
 		if (!num)
 			continue;

+		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
+		if (num > SLAB_OBJ_MAX_NUM)
+			break;
+
 		if (flags & CFLGS_OFF_SLAB) {
 			/*
 			 * Max number of objs-per-slab for caches which
···
 			 * looping condition in cache_grow().
 			 */
 			offslab_limit = size;
-			offslab_limit /= sizeof(unsigned int);
+			offslab_limit /= sizeof(freelist_idx_t);

 			if (num > offslab_limit)
 				break;
···
 		}
 	}
 	cachep->node[numa_mem_id()]->next_reap =
-			jiffies + REAPTIMEOUT_LIST3 +
-			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+			jiffies + REAPTIMEOUT_NODE +
+			((unsigned long)cachep) % REAPTIMEOUT_NODE;

 	cpu_cache_get(cachep)->avail = 0;
 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
···
 	 * it too early on. Always use on-slab management when
 	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
 	 */
-	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+	if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
 	    !(flags & SLAB_NOLEAKTRACE))
 		/*
 		 * Size is large, assume best to place the slab management obj
···
 		flags |= CFLGS_OFF_SLAB;

 	size = ALIGN(size, cachep->align);
+	/*
+	 * We should restrict the number of objects in a slab to implement
+	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
+	 */
+	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
+		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);

 	left_over = calculate_slab_order(cachep, size, cachep->align, flags);
···
 		return -E2BIG;

 	freelist_size =
-		ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
+		ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);

 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
···

 	if (flags & CFLGS_OFF_SLAB) {
 		/* really off slab. No need for manual alignment */
-		freelist_size = cachep->num * sizeof(unsigned int);
+		freelist_size = cachep->num * sizeof(freelist_idx_t);

 #ifdef CONFIG_PAGE_POISONING
 	/* If we're going to use the generic kernel_map_pages()
···
 	if (flags & CFLGS_OFF_SLAB) {
 		cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
 		/*
-		 * This is a possibility for one of the malloc_sizes caches.
+		 * This is a possibility for one of the kmalloc_{dma,}_caches.
 		 * But since we go off slab only for object size greater than
-		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
-		 * this should not happen at all.
+		 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
+		 * in ascending order,this should not happen at all.
 		 * But leave a BUG_ON for some lucky dude.
 		 */
 		BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
···

 /*
  * Get the memory for a slab management obj.
- * For a slab cache when the slab descriptor is off-slab, slab descriptors
- * always come from malloc_sizes caches. The slab descriptor cannot
- * come from the same cache which is getting created because,
- * when we are searching for an appropriate cache for these
- * descriptors in kmem_cache_create, we search through the malloc_sizes array.
- * If we are creating a malloc_sizes cache here it would not be visible to
- * kmem_find_general_cachep till the initialization is complete.
- * Hence we cannot have freelist_cache same as the original cache.
+ *
+ * For a slab cache when the slab descriptor is off-slab, the
+ * slab descriptor can't come from the same cache which is being created,
+ * Because if it is the case, that means we defer the creation of
+ * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
+ * And we eventually call down to __kmem_cache_create(), which
+ * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * This is a "chicken-and-egg" problem.
+ *
+ * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
+ * which are all initialized during kmem_cache_init().
  */
 static void *alloc_slabmgmt(struct kmem_cache *cachep,
 				   struct page *page, int colour_off,
···
 	return freelist;
 }

-static inline unsigned int *slab_freelist(struct page *page)
+static inline freelist_idx_t get_free_obj(struct page *page, unsigned char idx)
 {
-	return (unsigned int *)(page->freelist);
+	return ((freelist_idx_t *)page->freelist)[idx];
+}
+
+static inline void set_free_obj(struct page *page,
+					unsigned char idx, freelist_idx_t val)
+{
+	((freelist_idx_t *)(page->freelist))[idx] = val;
 }

 static void cache_init_objs(struct kmem_cache *cachep,
···
 		if (cachep->ctor)
 			cachep->ctor(objp);
 #endif
-		slab_freelist(page)[i] = i;
+		set_free_obj(page, i, i);
 	}
 }
···
 {
 	void *objp;

-	objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
+	objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
 	page->active++;
 #if DEBUG
 	WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
···

 	/* Verify double free bug */
 	for (i = page->active; i < cachep->num; i++) {
-		if (slab_freelist(page)[i] == objnr) {
+		if (get_free_obj(page, i) == objnr) {
 			printk(KERN_ERR "slab: double free detected in cache "
 					"'%s', objp %p\n", cachep->name, objp);
 			BUG();
···
 	}
 #endif
 	page->active--;
-	slab_freelist(page)[page->active] = objnr;
+	set_free_obj(page, page->active, objnr);
 }

 /*
···
 		/* move slabp to correct slabp list: */
 		list_del(&page->lru);
 		if (page->active == cachep->num)
-			list_add(&page->list, &n->slabs_full);
+			list_add(&page->lru, &n->slabs_full);
 		else
-			list_add(&page->list, &n->slabs_partial);
+			list_add(&page->lru, &n->slabs_partial);
 	}

must_grow:
···
 	kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
 				 flags);

-	if (likely(ptr))
+	if (likely(ptr)) {
 		kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
-
-	if (unlikely((flags & __GFP_ZERO) && ptr))
-		memset(ptr, 0, cachep->object_size);
+		if (unlikely(flags & __GFP_ZERO))
+			memset(ptr, 0, cachep->object_size);
+	}

 	return ptr;
 }
···
 				 flags);
 	prefetchw(objp);

-	if (likely(objp))
+	if (likely(objp)) {
 		kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
-
-	if (unlikely((flags & __GFP_ZERO) && objp))
-		memset(objp, 0, cachep->object_size);
+		if (unlikely(flags & __GFP_ZERO))
+			memset(objp, 0, cachep->object_size);
+	}

 	return objp;
 }

 /*
- * Caller needs to acquire correct kmem_list's list_lock
+ * Caller needs to acquire correct kmem_cache_node's list_lock
  */
 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
 			int node)
···
 	struct kmem_cache *cachep;
 	void *ret;

-	/* If you want to save a few bytes .text space: replace
-	 * __ with kmem_.
-	 * Then kmalloc uses the uninlined functions instead of the inline
-	 * functions.
-	 */
 	cachep = kmalloc_slab(size, flags);
 	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
 		return cachep;
···
 /*
  * This initializes kmem_cache_node or resizes various caches for all nodes.
  */
-static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
+static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int node;
 	struct kmem_cache_node *n;
···
 		}

 		kmem_cache_node_init(n);
-		n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+		n->next_reap = jiffies + REAPTIMEOUT_NODE +
+				((unsigned long)cachep) % REAPTIMEOUT_NODE;
 		n->shared = new_shared;
 		n->alien = new_alien;
 		n->free_limit = (1 + nr_cpus_node(node)) *
···
 		kfree(ccold);
 	}
 	kfree(new);
-	return alloc_kmemlist(cachep, gfp);
+	return alloc_kmem_cache_node(cachep, gfp);
 }

 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
···
 		if (time_after(n->next_reap, jiffies))
 			goto next;

-		n->next_reap = jiffies + REAPTIMEOUT_LIST3;
+		n->next_reap = jiffies + REAPTIMEOUT_NODE;

 		drain_array(searchp, n, n->shared, 0, node);
···
 	next_reap_node();
out:
 	/* Set up the next iteration */
-	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
+	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
 }

 #ifdef CONFIG_SLABINFO
···

 		for (j = page->active; j < c->num; j++) {
 			/* Skip freed item */
-			if (slab_freelist(page)[j] == i) {
+			if (get_free_obj(page, j) == i) {
 				active = false;
 				break;
 			}
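The calculate_nr_objs() helper factored out above is easiest to check with concrete numbers. The userspace restatement below (the input values are illustrative, not taken from a real cache) follows the same two steps: guess how many objects fit while ignoring the freelist's alignment padding, then back off by one if the aligned freelist no longer fits in the leftover space.

#include <stdio.h>

/* Power-of-two round-up, as the kernel's ALIGN() does. */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

/* Userspace restatement of calculate_nr_objs() from the hunk above;
 * idx_size is sizeof(freelist_idx_t), i.e. 1 byte after this series. */
static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
			     size_t idx_size, size_t align)
{
	int nr_objs;
	size_t freelist_size;

	/* Initial guess, ignoring the freelist's alignment padding. */
	nr_objs = slab_size / (buffer_size + idx_size);

	/* If the leftover bytes can't hold the aligned freelist, the
	 * guess was one object too many. */
	freelist_size = slab_size - nr_objs * buffer_size;
	if (freelist_size < ALIGN(nr_objs * idx_size, align))
		nr_objs--;

	return nr_objs;
}

int main(void)
{
	/* Example: 4 KiB slab, 64-byte objects, 64-byte alignment. */
	printf("byte indices (1 B each): %d objects\n",
	       calculate_nr_objs(4096, 64, 1, 64));
	printf("int indices (4 B each):  %d objects\n",
	       calculate_nr_objs(4096, 64, 4, 64));
	return 0;
}

In this example the byte-sized index packs 63 objects into the slab where a 4-byte index fits only 60, which is the freelist saving the series is after.
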
mm/slob.c  (+5 -5)

···

 static void set_slob_page_free(struct page *sp, struct list_head *list)
 {
-	list_add(&sp->list, list);
+	list_add(&sp->lru, list);
 	__SetPageSlobFree(sp);
 }

 static inline void clear_slob_page_free(struct page *sp)
 {
-	list_del(&sp->list);
+	list_del(&sp->lru);
 	__ClearPageSlobFree(sp);
 }
···

 	spin_lock_irqsave(&slob_lock, flags);
 	/* Iterate through each partially free page, try to find room */
-	list_for_each_entry(sp, slob_list, list) {
+	list_for_each_entry(sp, slob_list, lru) {
 #ifdef CONFIG_NUMA
 		/*
 		 * If there's a node specification, search for a partial
···
 			continue;

 		/* Attempt to alloc */
-		prev = sp->list.prev;
+		prev = sp->lru.prev;
 		b = slob_page_alloc(sp, size, align);
 		if (!b)
 			continue;
···
 	spin_lock_irqsave(&slob_lock, flags);
 	sp->units = SLOB_UNITS(PAGE_SIZE);
 	sp->freelist = b;
-	INIT_LIST_HEAD(&sp->list);
+	INIT_LIST_HEAD(&sp->lru);
 	set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
 	set_slob_page_free(sp, slob_list);
 	b = slob_page_alloc(sp, size, align);
mm/slub.c  (+3 -2)

···
 	page = alloc_slab_page(alloc_gfp, node, oo);
 	if (unlikely(!page)) {
 		oo = s->min;
+		alloc_gfp = flags;
 		/*
 		 * Allocation may have failed due to fragmentation.
 		 * Try a lower order alloc if possible
 		 */
-		page = alloc_slab_page(flags, node, oo);
+		page = alloc_slab_page(alloc_gfp, node, oo);

 		if (page)
 			stat(s, ORDER_FALLBACK);
···
 		&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
 		int pages = 1 << oo_order(oo);

-		kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
+		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);

 		/*
 		 * Objects from caches that have a constructor don't get