mm: swap: skip slot cache on freeing for mTHP

Currently when we are freeing mTHP folios from swap cache, we free them
one by one and put each entry into swap slot cache. Slot cache is
designed to reduce the overhead by batching the freeing, but mTHP swap
entries are already contiguous so they can be batch freed without it.
For them the slot cache saves little overhead, or even increases
overhead for larger mTHP.

What's more, mTHP entries could stay in swap cache for a while.
Contiguous swap entries are a rather rare resource, so releasing them
directly can help improve the mTHP allocation success rate when under
pressure.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-5-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Acked-by: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by

Andrew Morton 2 years ago 650975d2 3b2561b5

+26 -33

1 changed file

expand all

swapfile.c

+26 -33

mm/swapfile.c

··· 479 479 } 480 480 481 481 /* 482 - * The cluster ci decreases one usage. If the usage counter becomes 0, 482 + * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, 483 483 * which means no page in the cluster is in use, we can optionally discard 484 484 * the cluster and add it to free cluster list. 485 485 */ 486 - static void dec_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *ci) 486 + static void dec_cluster_info_page(struct swap_info_struct *p, 487 + struct swap_cluster_info *ci, int nr_pages) 487 488 { 488 489 if (!p->cluster_info) 489 490 return; 490 491 491 - VM_BUG_ON(ci->count == 0); 492 + VM_BUG_ON(ci->count < nr_pages); 492 493 VM_BUG_ON(cluster_is_free(ci)); 493 494 lockdep_assert_held(&p->lock); 494 495 lockdep_assert_held(&ci->lock); 495 - ci->count--; 496 + ci->count -= nr_pages; 496 497 497 498 if (!ci->count) { 498 499 free_cluster(p, ci); ··· 991 990 return n_ret; 992 991 } 993 992 994 - static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) 995 - { 996 - unsigned long offset = idx * SWAPFILE_CLUSTER; 997 - struct swap_cluster_info *ci; 998 - 999 - ci = lock_cluster(si, offset); 1000 - memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); 1001 - ci->count = 0; 1002 - free_cluster(si, ci); 1003 - unlock_cluster(ci); 1004 - swap_range_free(si, offset, SWAPFILE_CLUSTER); 1005 - } 1006 - 1007 993 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) 1008 994 { 1009 995 int order = swap_entry_order(entry_order); ··· 1249 1261 return usage; 1250 1262 } 1251 1263 1252 - static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) 1264 + /* 1265 + * Drop the last HAS_CACHE flag of swap entries, caller have to 1266 + * ensure all entries belong to the same cgroup. 
1267 + */ 1268 + static void swap_entry_range_free(struct swap_info_struct *p, swp_entry_t entry, 1269 + unsigned int nr_pages) 1253 1270 { 1254 - struct swap_cluster_info *ci; 1255 1271 unsigned long offset = swp_offset(entry); 1256 - unsigned char count; 1272 + unsigned char *map = p->swap_map + offset; 1273 + unsigned char *map_end = map + nr_pages; 1274 + struct swap_cluster_info *ci; 1257 1275 1258 1276 ci = lock_cluster(p, offset); 1259 - count = p->swap_map[offset]; 1260 - VM_BUG_ON(count != SWAP_HAS_CACHE); 1261 - p->swap_map[offset] = 0; 1262 - dec_cluster_info_page(p, ci); 1277 + do { 1278 + VM_BUG_ON(*map != SWAP_HAS_CACHE); 1279 + *map = 0; 1280 + } while (++map < map_end); 1281 + dec_cluster_info_page(p, ci, nr_pages); 1263 1282 unlock_cluster(ci); 1264 1283 1265 - mem_cgroup_uncharge_swap(entry, 1); 1266 - swap_range_free(p, offset, 1); 1284 + mem_cgroup_uncharge_swap(entry, nr_pages); 1285 + swap_range_free(p, offset, nr_pages); 1267 1286 } 1268 1287 1269 1288 static void cluster_swap_free_nr(struct swap_info_struct *sis, ··· 1331 1336 void put_swap_folio(struct folio *folio, swp_entry_t entry) 1332 1337 { 1333 1338 unsigned long offset = swp_offset(entry); 1334 - unsigned long idx = offset / SWAPFILE_CLUSTER; 1335 1339 struct swap_cluster_info *ci; 1336 1340 struct swap_info_struct *si; 1337 1341 unsigned char *map; ··· 1343 1349 return; 1344 1350 1345 1351 ci = lock_cluster_or_swap_info(si, offset); 1346 - if (size == SWAPFILE_CLUSTER) { 1352 + if (size > 1) { 1347 1353 map = si->swap_map + offset; 1348 - for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1354 + for (i = 0; i < size; i++) { 1349 1355 val = map[i]; 1350 1356 VM_BUG_ON(!(val & SWAP_HAS_CACHE)); 1351 1357 if (val == SWAP_HAS_CACHE) 1352 1358 free_entries++; 1353 1359 } 1354 - if (free_entries == SWAPFILE_CLUSTER) { 1360 + if (free_entries == size) { 1355 1361 unlock_cluster_or_swap_info(si, ci); 1356 1362 spin_lock(&si->lock); 1357 - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); 1358 - 
swap_free_cluster(si, idx); 1363 + swap_entry_range_free(si, entry, size); 1359 1364 spin_unlock(&si->lock); 1360 1365 return; 1361 1366 } ··· 1399 1406 for (i = 0; i < n; ++i) { 1400 1407 p = swap_info_get_cont(entries[i], prev); 1401 1408 if (p) 1402 - swap_entry_free(p, entries[i]); 1409 + swap_entry_range_free(p, entries[i], 1); 1403 1410 prev = p; 1404 1411 } 1405 1412 if (p)

Configure Feed

Configure Feed