mm, swap: use a global swap cluster for rotating devices

Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the
goal of the SWAP allocator is to avoid contention for clusters. It uses a
per-CPU cluster design, and each CPU will use a different cluster as much
as possible.

However, HDDs are very sensitive to fragmentation; contention is trivial
in comparison. Therefore, we use one global cluster for rotating devices
instead. This ensures that each order will be written to the same cluster
as much as possible, which helps make the I/O more contiguous.

This ensures that the performance of the cluster allocator is as good as
that of the old allocator. Test results after this commit, compared with
those before this series, are shown below.

Tested using 'make -j32' with tinyconfig, a 1G memcg limit, and HDD swap:

Before this series:
114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k
2901232inputs+0outputs (238877major+4227640minor)pagefaults

After this commit:
113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k
2548728inputs+0outputs (235471major+4238110minor)pagefaults

[ryncsn@gmail.com: check kmalloc() return in setup_clusters]
Link: https://lkml.kernel.org/r/CAMgjq7Au+o04ckHyT=iU-wVx9az=t0B-ZiC5E0bDqNrAtNOP-g@mail.gmail.com
Link: https://lkml.kernel.org/r/20250113175732.48099-13-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by

Andrew Morton 1 year ago bae8a4ef 3f641cf9

+42 -14

2 changed files

expand all

include

linux

swap.h

swapfile.c

include/linux/swap.h

··· 317 317 unsigned int pages; /* total of usable pages of swap */ 318 318 atomic_long_t inuse_pages; /* number of those currently in use */ 319 319 struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ 320 + struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ 321 + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ 320 322 struct rb_root swap_extent_root;/* root of the swap extent rbtree */ 321 323 struct block_device *bdev; /* swap device or bdev of swap file */ 322 324 struct file *swap_file; /* seldom referenced */

+40 -14

mm/swapfile.c

··· 820 820 out: 821 821 relocate_cluster(si, ci); 822 822 unlock_cluster(ci); 823 - __this_cpu_write(si->percpu_cluster->next[order], next); 823 + if (si->flags & SWP_SOLIDSTATE) 824 + __this_cpu_write(si->percpu_cluster->next[order], next); 825 + else 826 + si->global_cluster->next[order] = next; 824 827 return found; 825 828 } 826 829 ··· 884 881 struct swap_cluster_info *ci; 885 882 unsigned int offset, found = 0; 886 883 887 - /* Fast path using per CPU cluster */ 888 - local_lock(&si->percpu_cluster->lock); 889 - offset = __this_cpu_read(si->percpu_cluster->next[order]); 884 + if (si->flags & SWP_SOLIDSTATE) { 885 + /* Fast path using per CPU cluster */ 886 + local_lock(&si->percpu_cluster->lock); 887 + offset = __this_cpu_read(si->percpu_cluster->next[order]); 888 + } else { 889 + /* Serialize HDD SWAP allocation for each device. */ 890 + spin_lock(&si->global_cluster_lock); 891 + offset = si->global_cluster->next[order]; 892 + } 893 + 890 894 if (offset) { 891 895 ci = lock_cluster(si, offset); 892 896 /* Cluster could have been used by another order */ ··· 985 975 } 986 976 } 987 977 done: 988 - local_unlock(&si->percpu_cluster->lock); 989 - 978 + if (si->flags & SWP_SOLIDSTATE) 979 + local_unlock(&si->percpu_cluster->lock); 980 + else 981 + spin_unlock(&si->global_cluster_lock); 990 982 return found; 991 983 } 992 984 ··· 2796 2784 mutex_unlock(&swapon_mutex); 2797 2785 free_percpu(p->percpu_cluster); 2798 2786 p->percpu_cluster = NULL; 2787 + kfree(p->global_cluster); 2788 + p->global_cluster = NULL; 2799 2789 vfree(swap_map); 2800 2790 kvfree(zeromap); 2801 2791 kvfree(cluster_info); ··· 3203 3189 for (i = 0; i < nr_clusters; i++) 3204 3190 spin_lock_init(&cluster_info[i].lock); 3205 3191 3206 - si->percpu_cluster = alloc_percpu(struct percpu_cluster); 3207 - if (!si->percpu_cluster) 3208 - goto err_free; 3192 + if (si->flags & SWP_SOLIDSTATE) { 3193 + si->percpu_cluster = alloc_percpu(struct percpu_cluster); 3194 + if (!si->percpu_cluster) 3195 + goto 
err_free; 3209 3196 3210 - for_each_possible_cpu(cpu) { 3211 - struct percpu_cluster *cluster; 3197 + for_each_possible_cpu(cpu) { 3198 + struct percpu_cluster *cluster; 3212 3199 3213 - cluster = per_cpu_ptr(si->percpu_cluster, cpu); 3200 + cluster = per_cpu_ptr(si->percpu_cluster, cpu); 3201 + for (i = 0; i < SWAP_NR_ORDERS; i++) 3202 + cluster->next[i] = SWAP_ENTRY_INVALID; 3203 + local_lock_init(&cluster->lock); 3204 + } 3205 + } else { 3206 + si->global_cluster = kmalloc(sizeof(*si->global_cluster), 3207 + GFP_KERNEL); 3208 + if (!si->global_cluster) 3209 + goto err_free; 3214 3210 for (i = 0; i < SWAP_NR_ORDERS; i++) 3215 - cluster->next[i] = SWAP_ENTRY_INVALID; 3216 - local_lock_init(&cluster->lock); 3211 + si->global_cluster->next[i] = SWAP_ENTRY_INVALID; 3212 + spin_lock_init(&si->global_cluster_lock); 3217 3213 } 3218 3214 3219 3215 /* ··· 3497 3473 bad_swap: 3498 3474 free_percpu(si->percpu_cluster); 3499 3475 si->percpu_cluster = NULL; 3476 + kfree(si->global_cluster); 3477 + si->global_cluster = NULL; 3500 3478 inode = NULL; 3501 3479 destroy_swap_extents(si); 3502 3480 swap_cgroup_swapoff(si->type);

Configure Feed

Configure Feed