Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm, swap: implement dynamic allocation of swap table

Now that the swap table is cluster based, a free cluster can free its
table, since no one should modify it.

There could be speculative readers, such as swap cache lookups; protect
them by making the table RCU protected. Every swap table must be filled
with null entries before it is freed, so such readers will see either a
NULL pointer or a null-filled table that is being lazily freed.

On allocation, allocate the table when a cluster is put to use by an allocation of any order.

This way, we can significantly reduce the memory usage of large swap
devices.

This idea to dynamically release unused swap cluster data was initially
suggested by Chris Li while proposing the cluster swap allocator and it
suits the swap table idea very well.

Link: https://lkml.kernel.org/r/20250916160100.31545-15-ryncsn@gmail.com
Co-developed-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
07adc4cf 685a17fb

+195 -52
+1 -1
mm/swap.h
··· 36 36 u16 count; 37 37 u8 flags; 38 38 u8 order; 39 - atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ 39 + atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ 40 40 struct list_head list; 41 41 }; 42 42
+4 -5
mm/swap_state.c
··· 91 91 struct folio *folio; 92 92 93 93 for (;;) { 94 - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), 95 - swp_cluster_offset(entry)); 94 + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 95 + swp_cluster_offset(entry)); 96 96 if (!swp_tb_is_folio(swp_tb)) 97 97 return NULL; 98 98 folio = swp_tb_to_folio(swp_tb); ··· 115 115 { 116 116 unsigned long swp_tb; 117 117 118 - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), 119 - swp_cluster_offset(entry)); 118 + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 119 + swp_cluster_offset(entry)); 120 120 if (swp_tb_is_shadow(swp_tb)) 121 121 return swp_tb_to_shadow(swp_tb); 122 - 123 122 return NULL; 124 123 } 125 124
+34 -3
mm/swap_table.h
··· 2 2 #ifndef _MM_SWAP_TABLE_H 3 3 #define _MM_SWAP_TABLE_H 4 4 5 + #include <linux/rcupdate.h> 6 + #include <linux/atomic.h> 5 7 #include "swap.h" 8 + 9 + /* A typical flat array in each cluster as swap table */ 10 + struct swap_table { 11 + atomic_long_t entries[SWAPFILE_CLUSTER]; 12 + }; 6 13 7 14 /* 8 15 * A swap table entry represents the status of a swap slot on a swap ··· 83 76 static inline void __swap_table_set(struct swap_cluster_info *ci, 84 77 unsigned int off, unsigned long swp_tb) 85 78 { 79 + atomic_long_t *table = rcu_dereference_protected(ci->table, true); 80 + 81 + lockdep_assert_held(&ci->lock); 86 82 VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 87 - atomic_long_set(&ci->table[off], swp_tb); 83 + atomic_long_set(&table[off], swp_tb); 88 84 } 89 85 90 86 static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, 91 87 unsigned int off, unsigned long swp_tb) 92 88 { 89 + atomic_long_t *table = rcu_dereference_protected(ci->table, true); 90 + 91 + lockdep_assert_held(&ci->lock); 93 92 VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 94 93 /* Ordering is guaranteed by cluster lock, relax */ 95 - return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); 94 + return atomic_long_xchg_relaxed(&table[off], swp_tb); 96 95 } 97 96 98 97 static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, 99 98 unsigned int off) 100 99 { 100 + atomic_long_t *table; 101 + 101 102 VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 102 - return atomic_long_read(&ci->table[off]); 103 + table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); 104 + 105 + return atomic_long_read(&table[off]); 106 + } 107 + 108 + static inline unsigned long swap_table_get(struct swap_cluster_info *ci, 109 + unsigned int off) 110 + { 111 + atomic_long_t *table; 112 + unsigned long swp_tb; 113 + 114 + rcu_read_lock(); 115 + table = rcu_dereference(ci->table); 116 + swp_tb = table ? 
atomic_long_read(&table[off]) : null_to_swp_tb(); 117 + rcu_read_unlock(); 118 + 119 + return swp_tb; 103 120 } 104 121 #endif
+156 -43
mm/swapfile.c
··· 59 59 static void swap_range_alloc(struct swap_info_struct *si, 60 60 unsigned int nr_entries); 61 61 static bool folio_swapcache_freeable(struct folio *folio); 62 + static void move_cluster(struct swap_info_struct *si, 63 + struct swap_cluster_info *ci, struct list_head *list, 64 + enum swap_cluster_flags new_flags); 62 65 63 66 static DEFINE_SPINLOCK(swap_lock); 64 67 static unsigned int nr_swapfiles; ··· 107 104 static DEFINE_SPINLOCK(swap_avail_lock); 108 105 109 106 struct swap_info_struct *swap_info[MAX_SWAPFILES]; 107 + 108 + static struct kmem_cache *swap_table_cachep; 110 109 111 110 static DEFINE_MUTEX(swapon_mutex); 112 111 ··· 406 401 return info->flags == CLUSTER_FLAG_DISCARD; 407 402 } 408 403 404 + static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci) 405 + { 406 + return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock)); 407 + } 408 + 409 409 static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) 410 410 { 411 411 if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) 412 + return false; 413 + if (!cluster_table_is_alloced(ci)) 412 414 return false; 413 415 if (!order) 414 416 return true; ··· 434 422 return cluster_index(si, ci) * SWAPFILE_CLUSTER; 435 423 } 436 424 437 - static int swap_cluster_alloc_table(struct swap_cluster_info *ci) 438 - { 439 - WARN_ON(ci->table); 440 - ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); 441 - if (!ci->table) 442 - return -ENOMEM; 443 - return 0; 444 - } 445 - 446 425 static void swap_cluster_free_table(struct swap_cluster_info *ci) 447 426 { 448 427 unsigned int ci_off; 449 - unsigned long swp_tb; 428 + struct swap_table *table; 450 429 451 - if (!ci->table) 452 - return; 430 + /* Only empty cluster's table is allow to be freed */ 431 + lockdep_assert_held(&ci->lock); 432 + VM_WARN_ON_ONCE(!cluster_is_empty(ci)); 433 + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) 434 + VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, 
ci_off))); 435 + table = (void *)rcu_dereference_protected(ci->table, true); 436 + rcu_assign_pointer(ci->table, NULL); 453 437 454 - for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { 455 - swp_tb = __swap_table_get(ci, ci_off); 456 - if (!swp_tb_is_null(swp_tb)) 457 - pr_err_once("swap: unclean swap space on swapoff: 0x%lx", 458 - swp_tb); 438 + kmem_cache_free(swap_table_cachep, table); 439 + } 440 + 441 + /* 442 + * Allocate swap table for one cluster. Attempt an atomic allocation first, 443 + * then fallback to sleeping allocation. 444 + */ 445 + static struct swap_cluster_info * 446 + swap_cluster_alloc_table(struct swap_info_struct *si, 447 + struct swap_cluster_info *ci) 448 + { 449 + struct swap_table *table; 450 + 451 + /* 452 + * Only cluster isolation from the allocator does table allocation. 453 + * Swap allocator uses percpu clusters and holds the local lock. 454 + */ 455 + lockdep_assert_held(&ci->lock); 456 + lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); 457 + 458 + /* The cluster must be free and was just isolated from the free list. */ 459 + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); 460 + 461 + table = kmem_cache_zalloc(swap_table_cachep, 462 + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); 463 + if (table) { 464 + rcu_assign_pointer(ci->table, table); 465 + return ci; 459 466 } 460 467 461 - kfree(ci->table); 462 - ci->table = NULL; 468 + /* 469 + * Try a sleep allocation. Each isolated free cluster may cause 470 + * a sleep allocation, but there is a limited number of them, so 471 + * the potential recursive allocation is limited. 472 + */ 473 + spin_unlock(&ci->lock); 474 + if (!(si->flags & SWP_SOLIDSTATE)) 475 + spin_unlock(&si->global_cluster_lock); 476 + local_unlock(&percpu_swap_cluster.lock); 477 + 478 + table = kmem_cache_zalloc(swap_table_cachep, 479 + __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); 480 + 481 + /* 482 + * Back to atomic context. 
We might have migrated to a new CPU with a 483 + * usable percpu cluster. But just keep using the isolated cluster to 484 + * make things easier. Migration indicates a slight change of workload 485 + * so using a new free cluster might not be a bad idea, and the worst 486 + * could happen with ignoring the percpu cluster is fragmentation, 487 + * which is acceptable since this fallback and race is rare. 488 + */ 489 + local_lock(&percpu_swap_cluster.lock); 490 + if (!(si->flags & SWP_SOLIDSTATE)) 491 + spin_lock(&si->global_cluster_lock); 492 + spin_lock(&ci->lock); 493 + 494 + /* Nothing except this helper should touch a dangling empty cluster. */ 495 + if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { 496 + if (table) 497 + kmem_cache_free(swap_table_cachep, table); 498 + return ci; 499 + } 500 + 501 + if (!table) { 502 + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); 503 + spin_unlock(&ci->lock); 504 + return NULL; 505 + } 506 + 507 + rcu_assign_pointer(ci->table, table); 508 + return ci; 463 509 } 464 510 465 511 static void move_cluster(struct swap_info_struct *si, ··· 549 479 550 480 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) 551 481 { 552 - lockdep_assert_held(&ci->lock); 482 + swap_cluster_free_table(ci); 553 483 move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); 554 484 ci->order = 0; 555 485 } ··· 564 494 * this returns NULL for an non-empty list. 
565 495 */ 566 496 static struct swap_cluster_info *isolate_lock_cluster( 567 - struct swap_info_struct *si, struct list_head *list) 497 + struct swap_info_struct *si, struct list_head *list, int order) 568 498 { 569 - struct swap_cluster_info *ci, *ret = NULL; 499 + struct swap_cluster_info *ci, *found = NULL; 570 500 571 501 spin_lock(&si->lock); 572 - 573 - if (unlikely(!(si->flags & SWP_WRITEOK))) 574 - goto out; 575 - 576 502 list_for_each_entry(ci, list, list) { 577 503 if (!spin_trylock(&ci->lock)) 578 504 continue; ··· 580 514 581 515 list_del(&ci->list); 582 516 ci->flags = CLUSTER_FLAG_NONE; 583 - ret = ci; 517 + found = ci; 584 518 break; 585 519 } 586 - out: 587 520 spin_unlock(&si->lock); 588 521 589 - return ret; 522 + if (found && !cluster_table_is_alloced(found)) { 523 + /* Only an empty free cluster's swap table can be freed. */ 524 + VM_WARN_ON_ONCE(list != &si->free_clusters); 525 + VM_WARN_ON_ONCE(!cluster_is_empty(found)); 526 + return swap_cluster_alloc_table(si, found); 527 + } 528 + 529 + return found; 590 530 } 591 531 592 532 /* ··· 725 653 * added to free cluster list and its usage counter will be increased by 1. 726 654 * Only used for initialization. 
727 655 */ 728 - static void inc_cluster_info_page(struct swap_info_struct *si, 656 + static int inc_cluster_info_page(struct swap_info_struct *si, 729 657 struct swap_cluster_info *cluster_info, unsigned long page_nr) 730 658 { 731 659 unsigned long idx = page_nr / SWAPFILE_CLUSTER; 660 + struct swap_table *table; 732 661 struct swap_cluster_info *ci; 733 662 734 663 ci = cluster_info + idx; 664 + if (!ci->table) { 665 + table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL); 666 + if (!table) 667 + return -ENOMEM; 668 + rcu_assign_pointer(ci->table, table); 669 + } 670 + 735 671 ci->count++; 736 672 737 673 VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); 738 674 VM_BUG_ON(ci->flags); 675 + 676 + return 0; 739 677 } 740 678 741 679 static bool cluster_reclaim_range(struct swap_info_struct *si, ··· 927 845 unsigned int found = SWAP_ENTRY_INVALID; 928 846 929 847 do { 930 - struct swap_cluster_info *ci = isolate_lock_cluster(si, list); 848 + struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); 931 849 unsigned long offset; 932 850 933 851 if (!ci) ··· 952 870 if (force) 953 871 to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; 954 872 955 - while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { 873 + while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { 956 874 offset = cluster_offset(si, ci); 957 875 end = min(si->max, offset + SWAPFILE_CLUSTER); 958 876 to_scan--; ··· 1100 1018 done: 1101 1019 if (!(si->flags & SWP_SOLIDSTATE)) 1102 1020 spin_unlock(&si->global_cluster_lock); 1021 + 1103 1022 return found; 1104 1023 } 1105 1024 ··· 1968 1885 /* This is called for allocating swap entry, not cache */ 1969 1886 if (get_swap_device_info(si)) { 1970 1887 if (si->flags & SWP_WRITEOK) { 1888 + /* 1889 + * Grab the local lock to be complaint 1890 + * with swap table allocation. 
1891 + */ 1892 + local_lock(&percpu_swap_cluster.lock); 1971 1893 offset = cluster_alloc_swap_entry(si, 0, 1); 1894 + local_unlock(&percpu_swap_cluster.lock); 1972 1895 if (offset) { 1973 1896 entry = swp_entry(si->type, offset); 1974 1897 atomic_long_dec(&nr_swap_pages); ··· 2768 2679 static void free_cluster_info(struct swap_cluster_info *cluster_info, 2769 2680 unsigned long maxpages) 2770 2681 { 2682 + struct swap_cluster_info *ci; 2771 2683 int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); 2772 2684 2773 2685 if (!cluster_info) 2774 2686 return; 2775 - for (i = 0; i < nr_clusters; i++) 2776 - swap_cluster_free_table(&cluster_info[i]); 2687 + for (i = 0; i < nr_clusters; i++) { 2688 + ci = cluster_info + i; 2689 + /* Cluster with bad marks count will have a remaining table */ 2690 + spin_lock(&ci->lock); 2691 + if (rcu_dereference_protected(ci->table, true)) { 2692 + ci->count = 0; 2693 + swap_cluster_free_table(ci); 2694 + } 2695 + spin_unlock(&ci->lock); 2696 + } 2777 2697 kvfree(cluster_info); 2778 2698 } 2779 2699 ··· 2818 2720 struct address_space *mapping; 2819 2721 struct inode *inode; 2820 2722 struct filename *pathname; 2723 + unsigned int maxpages; 2821 2724 int err, found = 0; 2822 2725 2823 2726 if (!capable(CAP_SYS_ADMIN)) ··· 2925 2826 p->swap_map = NULL; 2926 2827 zeromap = p->zeromap; 2927 2828 p->zeromap = NULL; 2829 + maxpages = p->max; 2928 2830 cluster_info = p->cluster_info; 2929 - free_cluster_info(cluster_info, p->max); 2930 2831 p->max = 0; 2931 2832 p->cluster_info = NULL; 2932 2833 spin_unlock(&p->lock); ··· 2938 2839 p->global_cluster = NULL; 2939 2840 vfree(swap_map); 2940 2841 kvfree(zeromap); 2842 + free_cluster_info(cluster_info, maxpages); 2941 2843 /* Destroy swap account information */ 2942 2844 swap_cgroup_swapoff(p->type); 2943 2845 ··· 3317 3217 if (!cluster_info) 3318 3218 goto err; 3319 3219 3320 - for (i = 0; i < nr_clusters; i++) { 3220 + for (i = 0; i < nr_clusters; i++) 3321 3221 
spin_lock_init(&cluster_info[i].lock); 3322 - if (swap_cluster_alloc_table(&cluster_info[i])) 3323 - goto err_free; 3324 - } 3325 3222 3326 3223 if (!(si->flags & SWP_SOLIDSTATE)) { 3327 3224 si->global_cluster = kmalloc(sizeof(*si->global_cluster), ··· 3337 3240 * See setup_swap_map(): header page, bad pages, 3338 3241 * and the EOF part of the last cluster. 3339 3242 */ 3340 - inc_cluster_info_page(si, cluster_info, 0); 3243 + err = inc_cluster_info_page(si, cluster_info, 0); 3244 + if (err) 3245 + goto err; 3341 3246 for (i = 0; i < swap_header->info.nr_badpages; i++) { 3342 3247 unsigned int page_nr = swap_header->info.badpages[i]; 3343 3248 3344 3249 if (page_nr >= maxpages) 3345 3250 continue; 3346 - inc_cluster_info_page(si, cluster_info, page_nr); 3251 + err = inc_cluster_info_page(si, cluster_info, page_nr); 3252 + if (err) 3253 + goto err; 3347 3254 } 3348 - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) 3349 - inc_cluster_info_page(si, cluster_info, i); 3255 + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { 3256 + err = inc_cluster_info_page(si, cluster_info, i); 3257 + if (err) 3258 + goto err; 3259 + } 3350 3260 3351 3261 INIT_LIST_HEAD(&si->free_clusters); 3352 3262 INIT_LIST_HEAD(&si->full_clusters); ··· 4066 3962 plist_head_init(&swap_avail_heads[nid]); 4067 3963 4068 3964 swapfile_maximum_size = arch_max_swapfile_size(); 3965 + 3966 + /* 3967 + * Once a cluster is freed, it's swap table content is read 3968 + * only, and all swap cache readers (swap_cache_*) verifies 3969 + * the content before use. So it's safe to use RCU slab here. 3970 + */ 3971 + swap_table_cachep = kmem_cache_create("swap_table", 3972 + sizeof(struct swap_table), 3973 + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); 4069 3974 4070 3975 #ifdef CONFIG_MIGRATION 4071 3976 if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))