Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm, swap_slots: remove slot cache for freeing path

The slot cache for the freeing path is mostly for reducing the overhead of
si->lock. As we have basically eliminated the si->lock usage for the freeing
path, it can be removed.

This helps simplify the code, and prevents swap entries from being held in
the cache upon freeing. The delayed freeing of entries has been causing
trouble for further optimizations for zswap [1] and in theory will also
cause more fragmentation and extra overhead.

Testing with a Linux kernel build showed that both performance and
fragmentation are better without the cache:

time make -j96 / 768M memcg, 4K pages, 10G ZRAM, avg of 4 test runs:
Before:
Sys time: 36047.78, Real time: 472.43
After: (-7.6% sys time, -7.3% real time)
Sys time: 33314.76, Real time: 437.67

time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, avg of 4 test runs:
Before:
Sys time: 46859.04, Real time: 562.63
hugepages-64kB/stats/swpout: 1783392
hugepages-64kB/stats/swpout_fallback: 240875
After: (-23.3% sys time, -21.3% real time)
Sys time: 35958.87, Real time: 442.69
hugepages-64kB/stats/swpout: 1866267
hugepages-64kB/stats/swpout_fallback: 158330

Sequential swap should also be slightly faster. Tests didn't show a
measurable difference, but at least there is no regression:

Swapin 4G zero page on ZRAM (time in us):
Before (avg. 1923756):
1912391 1927023 1927957 1916527 1918263 1914284 1934753 1940813 1921791
After (avg. 1922290):
1919101 1925743 1916810 1917007 1923930 1935152 1917403 1923549 1921913

Link: https://lore.kernel.org/all/CAMgjq7ACohT_uerSz8E_994ZZCv709Zor+43hdmesW_59W1BWw@mail.gmail.com/ [1]
Link: https://lkml.kernel.org/r/20250113175732.48099-14-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
4f79384a bae8a4ef

+44 -126
-3
include/linux/swap_slots.h
··· 16 16 swp_entry_t *slots; 17 17 int nr; 18 18 int cur; 19 - spinlock_t free_lock; /* protects slots_ret, n_ret */ 20 - swp_entry_t *slots_ret; 21 19 int n_ret; 22 20 }; 23 21 24 22 void disable_swap_slots_cache_lock(void); 25 23 void reenable_swap_slots_cache_unlock(void); 26 24 void enable_swap_slots_cache(void); 27 - void free_swap_slot(swp_entry_t entry); 28 25 29 26 extern bool swap_slot_cache_enabled; 30 27
+10 -68
mm/swap_slots.c
··· 43 43 /* Serialize swap slots cache enable/disable operations */ 44 44 static DEFINE_MUTEX(swap_slots_cache_enable_mutex); 45 45 46 - static void __drain_swap_slots_cache(unsigned int type); 46 + static void __drain_swap_slots_cache(void); 47 47 48 48 #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) 49 - #define SLOTS_CACHE 0x1 50 - #define SLOTS_CACHE_RET 0x2 51 49 52 50 static void deactivate_swap_slots_cache(void) 53 51 { 54 52 mutex_lock(&swap_slots_cache_mutex); 55 53 swap_slot_cache_active = false; 56 - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); 54 + __drain_swap_slots_cache(); 57 55 mutex_unlock(&swap_slots_cache_mutex); 58 56 } 59 57 ··· 70 72 if (swap_slot_cache_initialized) { 71 73 /* serialize with cpu hotplug operations */ 72 74 cpus_read_lock(); 73 - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); 75 + __drain_swap_slots_cache(); 74 76 cpus_read_unlock(); 75 77 } 76 78 } ··· 111 113 static int alloc_swap_slot_cache(unsigned int cpu) 112 114 { 113 115 struct swap_slots_cache *cache; 114 - swp_entry_t *slots, *slots_ret; 116 + swp_entry_t *slots; 115 117 116 118 /* 117 119 * Do allocation outside swap_slots_cache_mutex ··· 123 125 if (!slots) 124 126 return -ENOMEM; 125 127 126 - slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), 127 - GFP_KERNEL); 128 - if (!slots_ret) { 129 - kvfree(slots); 130 - return -ENOMEM; 131 - } 132 - 133 128 mutex_lock(&swap_slots_cache_mutex); 134 129 cache = &per_cpu(swp_slots, cpu); 135 - if (cache->slots || cache->slots_ret) { 130 + if (cache->slots) { 136 131 /* cache already allocated */ 137 132 mutex_unlock(&swap_slots_cache_mutex); 138 133 139 134 kvfree(slots); 140 - kvfree(slots_ret); 141 135 142 136 return 0; 143 137 } 144 138 145 139 if (!cache->lock_initialized) { 146 140 mutex_init(&cache->alloc_lock); 147 - spin_lock_init(&cache->free_lock); 148 141 cache->lock_initialized = true; 149 142 } 150 143 cache->nr = 0; ··· 149 160 */ 150 161 mb(); 151 
162 cache->slots = slots; 152 - cache->slots_ret = slots_ret; 153 163 mutex_unlock(&swap_slots_cache_mutex); 154 164 return 0; 155 165 } 156 166 157 - static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, 158 - bool free_slots) 167 + static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots) 159 168 { 160 169 struct swap_slots_cache *cache; 161 - swp_entry_t *slots = NULL; 162 170 163 171 cache = &per_cpu(swp_slots, cpu); 164 - if ((type & SLOTS_CACHE) && cache->slots) { 172 + if (cache->slots) { 165 173 mutex_lock(&cache->alloc_lock); 166 174 swapcache_free_entries(cache->slots + cache->cur, cache->nr); 167 175 cache->cur = 0; ··· 169 183 } 170 184 mutex_unlock(&cache->alloc_lock); 171 185 } 172 - if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { 173 - spin_lock_irq(&cache->free_lock); 174 - swapcache_free_entries(cache->slots_ret, cache->n_ret); 175 - cache->n_ret = 0; 176 - if (free_slots && cache->slots_ret) { 177 - slots = cache->slots_ret; 178 - cache->slots_ret = NULL; 179 - } 180 - spin_unlock_irq(&cache->free_lock); 181 - kvfree(slots); 182 - } 183 186 } 184 187 185 - static void __drain_swap_slots_cache(unsigned int type) 188 + static void __drain_swap_slots_cache(void) 186 189 { 187 190 unsigned int cpu; 188 191 ··· 199 224 * There are no slots on such cpu that need to be drained. 
200 225 */ 201 226 for_each_online_cpu(cpu) 202 - drain_slots_cache_cpu(cpu, type, false); 227 + drain_slots_cache_cpu(cpu, false); 203 228 } 204 229 205 230 static int free_slot_cache(unsigned int cpu) 206 231 { 207 232 mutex_lock(&swap_slots_cache_mutex); 208 - drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); 233 + drain_slots_cache_cpu(cpu, true); 209 234 mutex_unlock(&swap_slots_cache_mutex); 210 235 return 0; 211 236 } ··· 242 267 cache->slots, 0); 243 268 244 269 return cache->nr; 245 - } 246 - 247 - void free_swap_slot(swp_entry_t entry) 248 - { 249 - struct swap_slots_cache *cache; 250 - 251 - /* Large folio swap slot is not covered. */ 252 - zswap_invalidate(entry); 253 - 254 - cache = raw_cpu_ptr(&swp_slots); 255 - if (likely(use_swap_slot_cache && cache->slots_ret)) { 256 - spin_lock_irq(&cache->free_lock); 257 - /* Swap slots cache may be deactivated before acquiring lock */ 258 - if (!use_swap_slot_cache || !cache->slots_ret) { 259 - spin_unlock_irq(&cache->free_lock); 260 - goto direct_free; 261 - } 262 - if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { 263 - /* 264 - * Return slots to global pool. 265 - * The current swap_map value is SWAP_HAS_CACHE. 266 - * Set it to 0 to indicate it is available for 267 - * allocation in global pool 268 - */ 269 - swapcache_free_entries(cache->slots_ret, cache->n_ret); 270 - cache->n_ret = 0; 271 - } 272 - cache->slots_ret[cache->n_ret++] = entry; 273 - spin_unlock_irq(&cache->free_lock); 274 - } else { 275 - direct_free: 276 - swapcache_free_entries(&entry, 1); 277 - } 278 270 } 279 271 280 272 swp_entry_t folio_alloc_swap(struct folio *folio)
+34 -55
mm/swapfile.c
··· 53 53 static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 54 54 unsigned char); 55 55 static void free_swap_count_continuations(struct swap_info_struct *); 56 - static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, 57 - unsigned int nr_pages); 56 + static void swap_entry_range_free(struct swap_info_struct *si, 57 + struct swap_cluster_info *ci, 58 + swp_entry_t entry, unsigned int nr_pages); 58 59 static void swap_range_alloc(struct swap_info_struct *si, 59 60 unsigned int nr_entries); 60 61 static bool folio_swapcache_freeable(struct folio *folio); 61 62 static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, 62 63 unsigned long offset); 63 - static void unlock_cluster(struct swap_cluster_info *ci); 64 + static inline void unlock_cluster(struct swap_cluster_info *ci); 64 65 65 66 static DEFINE_SPINLOCK(swap_lock); 66 67 static unsigned int nr_swapfiles; ··· 262 261 folio_ref_sub(folio, nr_pages); 263 262 folio_set_dirty(folio); 264 263 265 - /* Only sinple page folio can be backed by zswap */ 266 - if (nr_pages == 1) 267 - zswap_invalidate(entry); 268 - swap_entry_range_free(si, entry, nr_pages); 264 + ci = lock_cluster(si, offset); 265 + swap_entry_range_free(si, ci, entry, nr_pages); 266 + unlock_cluster(ci); 269 267 ret = nr_pages; 270 268 out_unlock: 271 269 folio_unlock(folio); ··· 1128 1128 * Use atomic clear_bit operations only on zeromap instead of non-atomic 1129 1129 * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. 
1130 1130 */ 1131 - for (i = 0; i < nr_entries; i++) 1131 + for (i = 0; i < nr_entries; i++) { 1132 1132 clear_bit(offset + i, si->zeromap); 1133 + zswap_invalidate(swp_entry(si->type, offset + i)); 1134 + } 1133 1135 1134 1136 if (si->flags & SWP_BLKDEV) 1135 1137 swap_slot_free_notify = ··· 1436 1434 1437 1435 ci = lock_cluster(si, offset); 1438 1436 usage = __swap_entry_free_locked(si, offset, 1); 1439 - unlock_cluster(ci); 1440 1437 if (!usage) 1441 - free_swap_slot(entry); 1438 + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); 1439 + unlock_cluster(ci); 1442 1440 1443 1441 return usage; 1444 1442 } ··· 1466 1464 } 1467 1465 for (i = 0; i < nr; i++) 1468 1466 WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); 1467 + if (!has_cache) 1468 + swap_entry_range_free(si, ci, entry, nr); 1469 1469 unlock_cluster(ci); 1470 1470 1471 - if (!has_cache) { 1472 - for (i = 0; i < nr; i++) 1473 - zswap_invalidate(swp_entry(si->type, offset + i)); 1474 - swap_entry_range_free(si, entry, nr); 1475 - } 1476 1471 return has_cache; 1477 1472 1478 1473 fallback: ··· 1489 1490 * Drop the last HAS_CACHE flag of swap entries, caller have to 1490 1491 * ensure all entries belong to the same cgroup. 
1491 1492 */ 1492 - static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, 1493 - unsigned int nr_pages) 1493 + static void swap_entry_range_free(struct swap_info_struct *si, 1494 + struct swap_cluster_info *ci, 1495 + swp_entry_t entry, unsigned int nr_pages) 1494 1496 { 1495 1497 unsigned long offset = swp_offset(entry); 1496 1498 unsigned char *map = si->swap_map + offset; 1497 1499 unsigned char *map_end = map + nr_pages; 1498 - struct swap_cluster_info *ci; 1499 - 1500 - ci = lock_cluster(si, offset); 1501 1500 1502 1501 /* It should never free entries across different clusters */ 1503 1502 VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); ··· 1515 1518 free_cluster(si, ci); 1516 1519 else 1517 1520 partial_free_cluster(si, ci); 1518 - unlock_cluster(ci); 1519 1521 } 1520 1522 1521 1523 static void cluster_swap_free_nr(struct swap_info_struct *si, ··· 1522 1526 unsigned char usage) 1523 1527 { 1524 1528 struct swap_cluster_info *ci; 1525 - DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; 1526 - int i, nr; 1529 + unsigned long end = offset + nr_pages; 1527 1530 1528 1531 ci = lock_cluster(si, offset); 1529 - while (nr_pages) { 1530 - nr = min(BITS_PER_LONG, nr_pages); 1531 - for (i = 0; i < nr; i++) { 1532 - if (!__swap_entry_free_locked(si, offset + i, usage)) 1533 - bitmap_set(to_free, i, 1); 1534 - } 1535 - if (!bitmap_empty(to_free, BITS_PER_LONG)) { 1536 - unlock_cluster(ci); 1537 - for_each_set_bit(i, to_free, BITS_PER_LONG) 1538 - free_swap_slot(swp_entry(si->type, offset + i)); 1539 - if (nr == nr_pages) 1540 - return; 1541 - bitmap_clear(to_free, 0, BITS_PER_LONG); 1542 - ci = lock_cluster(si, offset); 1543 - } 1544 - offset += nr; 1545 - nr_pages -= nr; 1546 - } 1532 + do { 1533 + if (!__swap_entry_free_locked(si, offset, usage)) 1534 + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); 1535 + } while (++offset < end); 1547 1536 unlock_cluster(ci); 1548 1537 } 1549 1538 ··· 1569 1588 return; 1570 1589 
1571 1590 ci = lock_cluster(si, offset); 1572 - if (size > 1 && swap_is_has_cache(si, offset, size)) { 1573 - unlock_cluster(ci); 1574 - swap_entry_range_free(si, entry, size); 1575 - return; 1576 - } 1577 - for (int i = 0; i < size; i++, entry.val++) { 1578 - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { 1579 - unlock_cluster(ci); 1580 - free_swap_slot(entry); 1581 - if (i == size - 1) 1582 - return; 1583 - lock_cluster(si, offset); 1591 + if (swap_is_has_cache(si, offset, size)) 1592 + swap_entry_range_free(si, ci, entry, size); 1593 + else { 1594 + for (int i = 0; i < size; i++, entry.val++) { 1595 + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) 1596 + swap_entry_range_free(si, ci, entry, 1); 1584 1597 } 1585 1598 } 1586 1599 unlock_cluster(ci); ··· 1583 1608 void swapcache_free_entries(swp_entry_t *entries, int n) 1584 1609 { 1585 1610 int i; 1611 + struct swap_cluster_info *ci; 1586 1612 struct swap_info_struct *si = NULL; 1587 1613 1588 1614 if (n <= 0) ··· 1591 1615 1592 1616 for (i = 0; i < n; ++i) { 1593 1617 si = _swap_info_get(entries[i]); 1594 - if (si) 1595 - swap_entry_range_free(si, entries[i], 1); 1618 + if (si) { 1619 + ci = lock_cluster(si, swp_offset(entries[i])); 1620 + swap_entry_range_free(si, ci, entries[i], 1); 1621 + unlock_cluster(ci); 1622 + } 1596 1623 } 1597 1624 } 1598 1625