Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: swap: reclaim the cached parts that got scanned

This commit implements reclaim during scan for the cluster allocator.

Cluster scanning was unable to reuse SWAP_HAS_CACHE slots, which could
result in a low allocation success rate or early OOM.

So, to ensure the maximum allocation success rate, integrate reclaiming with
scanning. If a range of otherwise suitable swap slots is found but is
fragmented by HAS_CACHE entries, just try to reclaim those slots.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
661383c6 477cb7ba

+112 -33
+1
include/linux/swap.h
··· 301 301 /* list of cluster that contains at least one free slot */ 302 302 struct list_head frag_clusters[SWAP_NR_ORDERS]; 303 303 /* list of cluster that are fragmented or contented */ 304 + unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; 304 305 unsigned int lowest_bit; /* index of first free in swap_map */ 305 306 unsigned int highest_bit; /* index of last free in swap_map */ 306 307 unsigned int pages; /* total of usable pages of swap */
+111 -33
mm/swapfile.c
··· 513 513 VM_BUG_ON(ci->count != 0); 514 514 lockdep_assert_held(&si->lock); 515 515 lockdep_assert_held(&ci->lock); 516 + 517 + if (ci->flags & CLUSTER_FLAG_FRAG) 518 + si->frag_cluster_nr[ci->order]--; 519 + 516 520 /* 517 521 * If the swap is discardable, prepare discard the cluster 518 522 * instead of free it immediately. The cluster will be freed ··· 576 572 577 573 if (!(ci->flags & CLUSTER_FLAG_NONFULL)) { 578 574 VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE); 579 - if (ci->flags & CLUSTER_FLAG_FRAG) 575 + if (ci->flags & CLUSTER_FLAG_FRAG) { 576 + p->frag_cluster_nr[ci->order]--; 580 577 list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]); 581 - else 578 + } else { 582 579 list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]); 580 + } 583 581 ci->flags = CLUSTER_FLAG_NONFULL; 584 582 } 585 583 } 586 584 587 - static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start, 588 - unsigned int nr_pages) 585 + static bool cluster_reclaim_range(struct swap_info_struct *si, 586 + struct swap_cluster_info *ci, 587 + unsigned long start, unsigned long end) 589 588 { 590 - unsigned char *p = si->swap_map + start; 591 - unsigned char *end = p + nr_pages; 589 + unsigned char *map = si->swap_map; 590 + unsigned long offset; 592 591 593 - while (p < end) 594 - if (*p++) 592 + spin_unlock(&ci->lock); 593 + spin_unlock(&si->lock); 594 + 595 + for (offset = start; offset < end; offset++) { 596 + switch (READ_ONCE(map[offset])) { 597 + case 0: 598 + continue; 599 + case SWAP_HAS_CACHE: 600 + if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) 601 + continue; 602 + goto out; 603 + default: 604 + goto out; 605 + } 606 + } 607 + out: 608 + spin_lock(&si->lock); 609 + spin_lock(&ci->lock); 610 + 611 + /* 612 + * Recheck the range no matter reclaim succeeded or not, the slot 613 + * could have been be freed while we are not holding the lock. 
614 + */ 615 + for (offset = start; offset < end; offset++) 616 + if (READ_ONCE(map[offset])) 595 617 return false; 596 618 597 619 return true; 598 620 } 599 621 622 + static bool cluster_scan_range(struct swap_info_struct *si, 623 + struct swap_cluster_info *ci, 624 + unsigned long start, unsigned int nr_pages) 625 + { 626 + unsigned long offset, end = start + nr_pages; 627 + unsigned char *map = si->swap_map; 628 + bool need_reclaim = false; 600 629 601 - static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, 602 - unsigned int start, unsigned char usage, 603 - unsigned int order) 630 + for (offset = start; offset < end; offset++) { 631 + switch (READ_ONCE(map[offset])) { 632 + case 0: 633 + continue; 634 + case SWAP_HAS_CACHE: 635 + if (!vm_swap_full()) 636 + return false; 637 + need_reclaim = true; 638 + continue; 639 + default: 640 + return false; 641 + } 642 + } 643 + 644 + if (need_reclaim) 645 + return cluster_reclaim_range(si, ci, start, end); 646 + 647 + return true; 648 + } 649 + 650 + static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, 651 + unsigned int start, unsigned char usage, 652 + unsigned int order) 604 653 { 605 654 unsigned int nr_pages = 1 << order; 606 655 ··· 672 615 if (ci->count == SWAPFILE_CLUSTER) { 673 616 VM_BUG_ON(!(ci->flags & 674 617 (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG))); 618 + if (ci->flags & CLUSTER_FLAG_FRAG) 619 + si->frag_cluster_nr[ci->order]--; 675 620 list_del(&ci->list); 676 621 ci->flags = 0; 677 622 } ··· 699 640 } 700 641 701 642 while (offset <= end) { 702 - if (cluster_scan_range(si, offset, nr_pages)) { 643 + if (cluster_scan_range(si, ci, offset, nr_pages)) { 703 644 cluster_alloc_range(si, ci, offset, usage, order); 704 645 *foundp = offset; 705 646 if (ci->count == SWAPFILE_CLUSTER) { ··· 727 668 unsigned char usage) 728 669 { 729 670 struct percpu_cluster *cluster; 730 - struct swap_cluster_info *ci, *n; 
671 + struct swap_cluster_info *ci; 731 672 unsigned int offset, found = 0; 732 - LIST_HEAD(fraged); 733 673 734 674 new_cluster: 735 675 lockdep_assert_held(&si->lock); ··· 748 690 } 749 691 750 692 if (order < PMD_ORDER) { 751 - list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) { 752 - list_move_tail(&ci->list, &fraged); 693 + unsigned int frags = 0; 694 + 695 + while (!list_empty(&si->nonfull_clusters[order])) { 696 + ci = list_first_entry(&si->nonfull_clusters[order], 697 + struct swap_cluster_info, list); 698 + list_move_tail(&ci->list, &si->frag_clusters[order]); 753 699 ci->flags = CLUSTER_FLAG_FRAG; 700 + si->frag_cluster_nr[order]++; 754 701 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 755 702 &found, order, usage); 703 + frags++; 756 704 if (found) 757 705 break; 758 706 } 759 707 760 708 if (!found) { 761 - list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) { 709 + /* 710 + * Nonfull clusters are moved to frag tail if we reached 711 + * here, count them too, don't over scan the frag list. 712 + */ 713 + while (frags < si->frag_cluster_nr[order]) { 714 + ci = list_first_entry(&si->frag_clusters[order], 715 + struct swap_cluster_info, list); 716 + /* 717 + * Rotate the frag list to iterate, they were all failing 718 + * high order allocation or moved here due to per-CPU usage, 719 + * this help keeping usable cluster ahead. 
720 + */ 721 + list_move_tail(&ci->list, &si->frag_clusters[order]); 762 722 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 763 723 &found, order, usage); 724 + frags++; 764 725 if (found) 765 726 break; 766 727 } 767 728 } 768 - 769 - list_splice_tail(&fraged, &si->frag_clusters[order]); 770 729 } 771 730 772 731 if (found) ··· 804 729 805 730 /* Order 0 stealing from higher order */ 806 731 for (int o = 1; o < SWAP_NR_ORDERS; o++) { 807 - if (!list_empty(&si->frag_clusters[o])) { 732 + /* 733 + * Clusters here have at least one usable slots and can't fail order 0 734 + * allocation, but reclaim may drop si->lock and race with another user. 735 + */ 736 + while (!list_empty(&si->frag_clusters[o])) { 808 737 ci = list_first_entry(&si->frag_clusters[o], 809 738 struct swap_cluster_info, list); 810 - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, 811 - 0, usage); 812 - VM_BUG_ON(!found); 813 - goto done; 814 - } 815 - 816 - if (!list_empty(&si->nonfull_clusters[o])) { 817 - ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info, 818 - list); 819 739 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 820 740 &found, 0, usage); 821 - VM_BUG_ON(!found); 822 - goto done; 741 + if (found) 742 + goto done; 743 + } 744 + 745 + while (!list_empty(&si->nonfull_clusters[o])) { 746 + ci = list_first_entry(&si->nonfull_clusters[o], 747 + struct swap_cluster_info, list); 748 + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 749 + &found, 0, usage); 750 + if (found) 751 + goto done; 823 752 } 824 753 } 825 - 826 754 done: 827 755 cluster->next[order] = offset; 828 756 return found; ··· 3120 3042 for (i = 0; i < SWAP_NR_ORDERS; i++) { 3121 3043 INIT_LIST_HEAD(&p->nonfull_clusters[i]); 3122 3044 INIT_LIST_HEAD(&p->frag_clusters[i]); 3045 + p->frag_cluster_nr[i] = 0; 3123 3046 } 3124 3047 3125 3048 for (i = 0; i < swap_header->info.nr_badpages; i++) { ··· 3163 3084 3164 3085 if (!cluster_info) 3165 3086 
return nr_extents; 3166 - 3167 3087 3168 3088 /* 3169 3089 * Reduce false cache line sharing between cluster_info and