Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: swap: add an adaptive full cluster cache reclaim

Link all full clusters into one full list, and reclaim from it when the
allocation has run out of all usable clusters.

There are many reasons a folio can end up in the swap cache while
having no swap count reference. So the best way to search for such slots
is still by iterating over the swap clusters.

With the list acting as an LRU, iterating from the oldest cluster and
keeping the clusters rotating is a simple and clean way to free up
clusters that are potentially no longer in use.

On any allocation failure, try to reclaim and rotate only one cluster.
This is adaptive for high order allocations, since they can tolerate
fallback. So this avoids latency, and gives the full cluster list a fair
chance to get reclaimed. It relieves the usage pressure for the fallback
order 0 allocation or for following high order allocations.

If the swap device is getting very full, reclaim more aggressively to
ensure no OOM will happen. This ensures an order 0 heavy workload won't go
OOM, as order 0 won't fail if any cluster still has any space.

[ryncsn@gmail.com: fix discard of full cluster]
Link: https://lkml.kernel.org/r/CAMgjq7CWwK75_2Zi5P40K08pk9iqOcuWKL6khu=x4Yg_nXaQag@mail.gmail.com
Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-9-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kairui Song <ryncsn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
2cacbdfd 661383c6

+57 -13
+2
include/linux/swap.h
··· 260 260 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ 261 261 #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ 262 262 #define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ 263 + #define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ 263 264 264 265 /* 265 266 * The first page in the swap file is the swap header, which is always marked ··· 298 297 unsigned char *swap_map; /* vmalloc'ed array of usage counts */ 299 298 struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ 300 299 struct list_head free_clusters; /* free clusters list */ 300 + struct list_head full_clusters; /* full clusters list */ 301 301 struct list_head nonfull_clusters[SWAP_NR_ORDERS]; 302 302 /* list of cluster that contains at least one free slot */ 303 303 struct list_head frag_clusters[SWAP_NR_ORDERS];
+55 -13
mm/swapfile.c
··· 440 440 SWAP_MAP_BAD, SWAPFILE_CLUSTER); 441 441 442 442 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); 443 - if (ci->flags & CLUSTER_FLAG_NONFULL) 444 - list_move_tail(&ci->list, &si->discard_clusters); 445 - else 446 - list_add_tail(&ci->list, &si->discard_clusters); 443 + list_move_tail(&ci->list, &si->discard_clusters); 447 444 ci->flags = 0; 448 445 schedule_work(&si->discard_work); 449 446 } ··· 450 453 lockdep_assert_held(&si->lock); 451 454 lockdep_assert_held(&ci->lock); 452 455 453 - if (ci->flags & CLUSTER_FLAG_NONFULL) 456 + if (ci->flags) 454 457 list_move_tail(&ci->list, &si->free_clusters); 455 458 else 456 459 list_add_tail(&ci->list, &si->free_clusters); ··· 477 480 SWAPFILE_CLUSTER); 478 481 479 482 spin_lock(&si->lock); 480 - 481 483 spin_lock(&ci->lock); 482 484 __free_cluster(si, ci); 483 485 memset(si->swap_map + idx * SWAPFILE_CLUSTER, ··· 572 576 573 577 if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { 574 578 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); 575 - if (ci->flags & CLUSTER_FLAG_FRAG) { 579 + if (ci->flags & CLUSTER_FLAG_FRAG) 576 580 p->frag_cluster_nr[ci->order]--; 577 - list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); 578 - } else { 579 - list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); 580 - } 581 + list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); 581 582 ci->flags = CLUSTER_FLAG_NONFULL; 582 583 } 583 584 } ··· 667 674 (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); 668 675 if (ci->flags & CLUSTER_FLAG_FRAG) 669 676 si->frag_cluster_nr[ci->order]--; 670 - list_del(&ci->list); 671 - ci->flags = 0; 677 + list_move_tail(&ci->list, &si->full_clusters); 678 + ci->flags = CLUSTER_FLAG_FULL; 672 679 } 673 680 } 674 681 ··· 709 716 done: 710 717 unlock_cluster(ci); 711 718 return offset; 719 + } 720 + 721 + static void swap_reclaim_full_clusters(struct swap_info_struct *si) 722 + { 723 + long to_scan = 1; 724 + unsigned long offset, end; 725 + struct swap_cluster_info *ci; 726 + unsigned char 
*map = si->swap_map; 727 + int nr_reclaim, total_reclaimed = 0; 728 + 729 + if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) 730 + to_scan = si->inuse_pages / SWAPFILE_CLUSTER; 731 + 732 + while (!list_empty(&si->full_clusters)) { 733 + ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); 734 + list_move_tail(&ci->list, &si->full_clusters); 735 + offset = cluster_offset(si, ci); 736 + end = min(si->max, offset + SWAPFILE_CLUSTER); 737 + to_scan--; 738 + 739 + while (offset < end) { 740 + if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { 741 + spin_unlock(&si->lock); 742 + nr_reclaim = __try_to_reclaim_swap(si, offset, 743 + TTRS_ANYWAY | TTRS_DIRECT); 744 + spin_lock(&si->lock); 745 + if (nr_reclaim > 0) { 746 + offset += nr_reclaim; 747 + total_reclaimed += nr_reclaim; 748 + continue; 749 + } else if (nr_reclaim < 0) { 750 + offset += -nr_reclaim; 751 + continue; 752 + } 753 + } 754 + offset++; 755 + } 756 + if (to_scan <= 0 || total_reclaimed) 757 + break; 758 + } 712 759 } 713 760 714 761 /* ··· 859 826 goto done; 860 827 } 861 828 } 829 + 862 830 done: 831 + /* Try reclaim from full clusters if device is nearfull */ 832 + if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { 833 + swap_reclaim_full_clusters(si); 834 + if (!found && !order && si->pages != si->inuse_pages) 835 + goto new_cluster; 836 + } 837 + 863 838 cluster->next[order] = offset; 864 839 return found; 865 840 } ··· 3156 3115 nr_good_pages = maxpages - 1; /* omit header page */ 3157 3116 3158 3117 INIT_LIST_HEAD(&p->free_clusters); 3118 + INIT_LIST_HEAD(&p->full_clusters); 3159 3119 INIT_LIST_HEAD(&p->discard_clusters); 3160 3120 3161 3121 for (i = 0; i < SWAP_NR_ORDERS; i++) {