Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: swap: allow cache reclaim to skip slot cache

Currently we free the reclaimed slots through the slot cache even if the
slot is required to be empty immediately. As a result, the reclaim caller
will see the slot still occupied even after a successful reclaim, and needs
to keep reclaiming until the slot cache gets flushed. This causes
ineffective reclaim or over-reclaim when swap is under stress.

So introduce a new flag, TTRS_DIRECT, that allows the slot to be emptied
directly, bypassing the slot cache.

[21cnbao@gmail.com: small folios should have nr_pages == 1 but not nr_page == 0]
Link: https://lkml.kernel.org/r/20240805015324.45134-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-6-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
862590ac 650975d2

+109 -43
+109 -43
mm/swapfile.c
··· 53 53 static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 54 54 unsigned char); 55 55 static void free_swap_count_continuations(struct swap_info_struct *); 56 + static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, 57 + unsigned int nr_pages); 56 58 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, 57 59 unsigned int nr_entries); 60 + static bool folio_swapcache_freeable(struct folio *folio); 61 + static struct swap_cluster_info *lock_cluster_or_swap_info( 62 + struct swap_info_struct *si, unsigned long offset); 63 + static void unlock_cluster_or_swap_info(struct swap_info_struct *si, 64 + struct swap_cluster_info *ci); 58 65 59 66 static DEFINE_SPINLOCK(swap_lock); 60 67 static unsigned int nr_swapfiles; ··· 136 129 * corresponding page 137 130 */ 138 131 #define TTRS_UNMAPPED 0x2 139 - /* Reclaim the swap entry if swap is getting full*/ 132 + /* Reclaim the swap entry if swap is getting full */ 140 133 #define TTRS_FULL 0x4 134 + /* Reclaim directly, bypass the slot cache and don't touch device lock */ 135 + #define TTRS_DIRECT 0x8 136 + 137 + static bool swap_is_has_cache(struct swap_info_struct *si, 138 + unsigned long offset, int nr_pages) 139 + { 140 + unsigned char *map = si->swap_map + offset; 141 + unsigned char *map_end = map + nr_pages; 142 + 143 + do { 144 + VM_BUG_ON(!(*map & SWAP_HAS_CACHE)); 145 + if (*map != SWAP_HAS_CACHE) 146 + return false; 147 + } while (++map < map_end); 148 + 149 + return true; 150 + } 141 151 142 152 /* 143 153 * returns number of pages in the folio that backs the swap entry. 
If positive, ··· 165 141 unsigned long offset, unsigned long flags) 166 142 { 167 143 swp_entry_t entry = swp_entry(si->type, offset); 144 + struct address_space *address_space = swap_address_space(entry); 145 + struct swap_cluster_info *ci; 168 146 struct folio *folio; 169 - int ret = 0; 147 + int ret, nr_pages; 148 + bool need_reclaim; 170 149 171 - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); 150 + folio = filemap_get_folio(address_space, swap_cache_index(entry)); 172 151 if (IS_ERR(folio)) 173 152 return 0; 153 + 154 + /* offset could point to the middle of a large folio */ 155 + entry = folio->swap; 156 + offset = swp_offset(entry); 157 + nr_pages = folio_nr_pages(folio); 158 + ret = -nr_pages; 159 + 174 160 /* 175 161 * When this function is called from scan_swap_map_slots() and it's 176 162 * called by vmscan.c at reclaiming folios. So we hold a folio lock ··· 188 154 * case and you should use folio_free_swap() with explicit folio_lock() 189 155 * in usual operations. 190 156 */ 191 - if (folio_trylock(folio)) { 192 - if ((flags & TTRS_ANYWAY) || 193 - ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || 194 - ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) 195 - ret = folio_free_swap(folio); 196 - folio_unlock(folio); 157 + if (!folio_trylock(folio)) 158 + goto out; 159 + 160 + need_reclaim = ((flags & TTRS_ANYWAY) || 161 + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || 162 + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); 163 + if (!need_reclaim || !folio_swapcache_freeable(folio)) 164 + goto out_unlock; 165 + 166 + /* 167 + * It's safe to delete the folio from swap cache only if the folio's 168 + * swap_map is HAS_CACHE only, which means the slots have no page table 169 + * reference or pending writeback, and can't be allocated to others. 
170 + */ 171 + ci = lock_cluster_or_swap_info(si, offset); 172 + need_reclaim = swap_is_has_cache(si, offset, nr_pages); 173 + unlock_cluster_or_swap_info(si, ci); 174 + if (!need_reclaim) 175 + goto out_unlock; 176 + 177 + if (!(flags & TTRS_DIRECT)) { 178 + /* Free through slot cache */ 179 + delete_from_swap_cache(folio); 180 + folio_set_dirty(folio); 181 + ret = nr_pages; 182 + goto out_unlock; 197 183 } 198 - ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio); 184 + 185 + xa_lock_irq(&address_space->i_pages); 186 + __delete_from_swap_cache(folio, entry, NULL); 187 + xa_unlock_irq(&address_space->i_pages); 188 + folio_ref_sub(folio, nr_pages); 189 + folio_set_dirty(folio); 190 + 191 + spin_lock(&si->lock); 192 + /* Only sinple page folio can be backed by zswap */ 193 + if (nr_pages == 1) 194 + zswap_invalidate(entry); 195 + swap_entry_range_free(si, entry, nr_pages); 196 + spin_unlock(&si->lock); 197 + ret = nr_pages; 198 + out_unlock: 199 + folio_unlock(folio); 200 + out: 199 201 folio_put(folio); 200 202 return ret; 201 203 } ··· 965 895 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 966 896 int swap_was_freed; 967 897 spin_unlock(&si->lock); 968 - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); 898 + swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); 969 899 spin_lock(&si->lock); 970 900 /* entry was freed successfully, try to use this again */ 971 901 if (swap_was_freed > 0) ··· 1403 1333 unsigned long offset = swp_offset(entry); 1404 1334 struct swap_cluster_info *ci; 1405 1335 struct swap_info_struct *si; 1406 - unsigned char *map; 1407 - unsigned int i, free_entries = 0; 1408 - unsigned char val; 1409 1336 int size = 1 << swap_entry_order(folio_order(folio)); 1410 1337 1411 1338 si = _swap_info_get(entry); ··· 1410 1343 return; 1411 1344 1412 1345 ci = lock_cluster_or_swap_info(si, offset); 1413 - if (size > 1) { 1414 - map = si->swap_map + offset; 1415 - for (i = 0; i < size; i++) { 
1416 - val = map[i]; 1417 - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); 1418 - if (val == SWAP_HAS_CACHE) 1419 - free_entries++; 1420 - } 1421 - if (free_entries == size) { 1422 - unlock_cluster_or_swap_info(si, ci); 1423 - spin_lock(&si->lock); 1424 - swap_entry_range_free(si, entry, size); 1425 - spin_unlock(&si->lock); 1426 - return; 1427 - } 1346 + if (size > 1 && swap_is_has_cache(si, offset, size)) { 1347 + unlock_cluster_or_swap_info(si, ci); 1348 + spin_lock(&si->lock); 1349 + swap_entry_range_free(si, entry, size); 1350 + spin_unlock(&si->lock); 1351 + return; 1428 1352 } 1429 - for (i = 0; i < size; i++, entry.val++) { 1353 + for (int i = 0; i < size; i++, entry.val++) { 1430 1354 if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { 1431 1355 unlock_cluster_or_swap_info(si, ci); 1432 1356 free_swap_slot(entry); ··· 1577 1519 return swap_page_trans_huge_swapped(si, entry, folio_order(folio)); 1578 1520 } 1579 1521 1580 - /** 1581 - * folio_free_swap() - Free the swap space used for this folio. 1582 - * @folio: The folio to remove. 1583 - * 1584 - * If swap is getting full, or if there are no more mappings of this folio, 1585 - * then call folio_free_swap to free its swap space. 1586 - * 1587 - * Return: true if we were able to release the swap space. 1588 - */ 1589 - bool folio_free_swap(struct folio *folio) 1522 + static bool folio_swapcache_freeable(struct folio *folio) 1590 1523 { 1591 1524 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 1592 1525 1593 1526 if (!folio_test_swapcache(folio)) 1594 1527 return false; 1595 1528 if (folio_test_writeback(folio)) 1596 - return false; 1597 - if (folio_swapped(folio)) 1598 1529 return false; 1599 1530 1600 1531 /* ··· 1602 1555 * to disk so check that here. 1603 1556 */ 1604 1557 if (pm_suspended_storage()) 1558 + return false; 1559 + 1560 + return true; 1561 + } 1562 + 1563 + /** 1564 + * folio_free_swap() - Free the swap space used for this folio. 1565 + * @folio: The folio to remove. 
1566 + * 1567 + * If swap is getting full, or if there are no more mappings of this folio, 1568 + * then call folio_free_swap to free its swap space. 1569 + * 1570 + * Return: true if we were able to release the swap space. 1571 + */ 1572 + bool folio_free_swap(struct folio *folio) 1573 + { 1574 + if (!folio_swapcache_freeable(folio)) 1575 + return false; 1576 + if (folio_swapped(folio)) 1605 1577 return false; 1606 1578 1607 1579 delete_from_swap_cache(folio); ··· 1699 1633 * to the next boundary. 1700 1634 */ 1701 1635 nr = __try_to_reclaim_swap(si, offset, 1702 - TTRS_UNMAPPED | TTRS_FULL); 1636 + TTRS_UNMAPPED | TTRS_FULL); 1703 1637 if (nr == 0) 1704 1638 nr = 1; 1705 1639 else if (nr < 0)