mm, swap: reduce contention on device lock

+247 -188

2 changed files

expand all

include

linux

swap.h

swapfile.c

+2 -1

include/linux/swap.h

··· 290 290 * throughput. 291 291 */ 292 292 struct percpu_cluster { 293 + local_lock_t lock; /* Protect the percpu_cluster above */ 293 294 unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ 294 295 }; 295 296 ··· 313 312 /* list of cluster that contains at least one free slot */ 314 313 struct list_head frag_clusters[SWAP_NR_ORDERS]; 315 314 /* list of cluster that are fragmented or contented */ 316 - unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; 315 + atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; 317 316 unsigned int pages; /* total of usable pages of swap */ 318 317 atomic_long_t inuse_pages; /* number of those currently in use */ 319 318 struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */

+245 -187

mm/swapfile.c

··· 261 261 folio_ref_sub(folio, nr_pages); 262 262 folio_set_dirty(folio); 263 263 264 - spin_lock(&si->lock); 265 264 /* Only sinple page folio can be backed by zswap */ 266 265 if (nr_pages == 1) 267 266 zswap_invalidate(entry); 268 267 swap_entry_range_free(si, entry, nr_pages); 269 - spin_unlock(&si->lock); 270 268 ret = nr_pages; 271 269 out_unlock: 272 270 folio_unlock(folio); ··· 399 401 #endif 400 402 #define LATENCY_LIMIT 256 401 403 402 - static inline bool cluster_is_free(struct swap_cluster_info *info) 404 + static inline bool cluster_is_empty(struct swap_cluster_info *info) 403 405 { 404 - return info->flags == CLUSTER_FLAG_FREE; 406 + return info->count == 0; 407 + } 408 + 409 + static inline bool cluster_is_discard(struct swap_cluster_info *info) 410 + { 411 + return info->flags == CLUSTER_FLAG_DISCARD; 412 + } 413 + 414 + static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) 415 + { 416 + if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) 417 + return false; 418 + if (!order) 419 + return true; 420 + return cluster_is_empty(ci) || order == ci->order; 405 421 } 406 422 407 423 static inline unsigned int cluster_index(struct swap_info_struct *si, ··· 453 441 VM_WARN_ON(ci->flags == new_flags); 454 442 455 443 BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX); 444 + lockdep_assert_held(&ci->lock); 456 445 457 - if (ci->flags == CLUSTER_FLAG_NONE) { 446 + spin_lock(&si->lock); 447 + if (ci->flags == CLUSTER_FLAG_NONE) 458 448 list_add_tail(&ci->list, list); 459 - } else { 460 - if (ci->flags == CLUSTER_FLAG_FRAG) { 461 - VM_WARN_ON(!si->frag_cluster_nr[ci->order]); 462 - si->frag_cluster_nr[ci->order]--; 463 - } 449 + else 464 450 list_move_tail(&ci->list, list); 465 - } 451 + spin_unlock(&si->lock); 452 + 453 + if (ci->flags == CLUSTER_FLAG_FRAG) 454 + atomic_long_dec(&si->frag_cluster_nr[ci->order]); 455 + else if (new_flags == CLUSTER_FLAG_FRAG) 456 + atomic_long_inc(&si->frag_cluster_nr[ci->order]); 466 457 ci->flags = new_flags; 467 - if (new_flags == CLUSTER_FLAG_FRAG) 468 - si->frag_cluster_nr[ci->order]++; 469 458 } 470 459 471 460 /* Add a cluster to discard list and schedule it to do discard */ ··· 489 476 490 477 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) 491 478 { 492 - lockdep_assert_held(&si->lock); 493 479 lockdep_assert_held(&ci->lock); 494 480 move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); 495 481 ci->order = 0; 496 482 } 497 483 498 484 /* 485 + * Isolate and lock the first cluster that is not contented on a list, 486 + * clean its flag before taken off-list. Cluster flag must be in sync 487 + * with list status, so cluster updaters can always know the cluster 488 + * list status without touching si lock. 489 + * 490 + * Note it's possible that all clusters on a list are contented so 491 + * this returns NULL for an non-empty list. 492 + */ 493 + static struct swap_cluster_info *isolate_lock_cluster( 494 + struct swap_info_struct *si, struct list_head *list) 495 + { 496 + struct swap_cluster_info *ci, *ret = NULL; 497 + 498 + spin_lock(&si->lock); 499 + 500 + if (unlikely(!(si->flags & SWP_WRITEOK))) 501 + goto out; 502 + 503 + list_for_each_entry(ci, list, list) { 504 + if (!spin_trylock(&ci->lock)) 505 + continue; 506 + 507 + /* We may only isolate and clear flags of following lists */ 508 + VM_BUG_ON(!ci->flags); 509 + VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE && 510 + ci->flags != CLUSTER_FLAG_FULL); 511 + 512 + list_del(&ci->list); 513 + ci->flags = CLUSTER_FLAG_NONE; 514 + ret = ci; 515 + break; 516 + } 517 + out: 518 + spin_unlock(&si->lock); 519 + 520 + return ret; 521 + } 522 + 523 + /* 499 524 * Doing discard actually. After a cluster discard is finished, the cluster 500 - * will be added to free cluster list. caller should hold si->lock. 501 - */ 502 - static void swap_do_scheduled_discard(struct swap_info_struct *si) 525 + * will be added to free cluster list. Discard cluster is a bit special as 526 + * they don't participate in allocation or reclaim, so clusters marked as 527 + * CLUSTER_FLAG_DISCARD must remain off-list or on discard list. 528 + */ 529 + static bool swap_do_scheduled_discard(struct swap_info_struct *si) 503 530 { 504 531 struct swap_cluster_info *ci; 532 + bool ret = false; 505 533 unsigned int idx; 506 534 535 + spin_lock(&si->lock); 507 536 while (!list_empty(&si->discard_clusters)) { 508 537 ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); 538 + /* 539 + * Delete the cluster from list to prepare for discard, but keep 540 + * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster 541 + * pointing to it, or ran into by relocate_cluster. 542 + */ 509 543 list_del(&ci->list); 510 - /* Must clear flag when taking a cluster off-list */ 511 - ci->flags = CLUSTER_FLAG_NONE; 512 544 idx = cluster_index(si, ci); 513 545 spin_unlock(&si->lock); 514 - 515 546 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, 516 547 SWAPFILE_CLUSTER); 517 548 518 - spin_lock(&si->lock); 519 549 spin_lock(&ci->lock); 520 - __free_cluster(si, ci); 550 + /* 551 + * Discard is done, clear its flags as it's off-list, then 552 + * return the cluster to allocation list. 553 + */ 554 + ci->flags = CLUSTER_FLAG_NONE; 521 555 memset(si->swap_map + idx * SWAPFILE_CLUSTER, 522 556 0, SWAPFILE_CLUSTER); 557 + __free_cluster(si, ci); 523 558 spin_unlock(&ci->lock); 559 + ret = true; 560 + spin_lock(&si->lock); 524 561 } 562 + spin_unlock(&si->lock); 563 + return ret; 525 564 } 526 565 527 566 static void swap_discard_work(struct work_struct *work) ··· 582 517 583 518 si = container_of(work, struct swap_info_struct, discard_work); 584 519 585 - spin_lock(&si->lock); 586 520 swap_do_scheduled_discard(si); 587 - spin_unlock(&si->lock); 588 521 } 589 522 590 523 static void swap_users_ref_free(struct percpu_ref *ref) ··· 593 530 complete(&si->comp); 594 531 } 595 532 533 + /* 534 + * Must be called after freeing if ci->count == 0, moves the cluster to free 535 + * or discard list. 536 + */ 596 537 static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) 597 538 { 598 539 VM_BUG_ON(ci->count != 0); 599 - lockdep_assert_held(&si->lock); 540 + VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE); 600 541 lockdep_assert_held(&ci->lock); 601 542 602 543 /* ··· 615 548 } 616 549 617 550 __free_cluster(si, ci); 551 + } 552 + 553 + /* 554 + * Must be called after freeing if ci->count != 0, moves the cluster to 555 + * nonfull list. 556 + */ 557 + static void partial_free_cluster(struct swap_info_struct *si, 558 + struct swap_cluster_info *ci) 559 + { 560 + VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER); 561 + lockdep_assert_held(&ci->lock); 562 + 563 + if (ci->flags != CLUSTER_FLAG_NONFULL) 564 + move_cluster(si, ci, &si->nonfull_clusters[ci->order], 565 + CLUSTER_FLAG_NONFULL); 566 + } 567 + 568 + /* 569 + * Must be called after allocation, moves the cluster to full or frag list. 570 + * Note: allocation doesn't acquire si lock, and may drop the ci lock for 571 + * reclaim, so the cluster could be any where when called. 572 + */ 573 + static void relocate_cluster(struct swap_info_struct *si, 574 + struct swap_cluster_info *ci) 575 + { 576 + lockdep_assert_held(&ci->lock); 577 + 578 + /* Discard cluster must remain off-list or on discard list */ 579 + if (cluster_is_discard(ci)) 580 + return; 581 + 582 + if (!ci->count) { 583 + free_cluster(si, ci); 584 + } else if (ci->count != SWAPFILE_CLUSTER) { 585 + if (ci->flags != CLUSTER_FLAG_FRAG) 586 + move_cluster(si, ci, &si->frag_clusters[ci->order], 587 + CLUSTER_FLAG_FRAG); 588 + } else { 589 + if (ci->flags != CLUSTER_FLAG_FULL) 590 + move_cluster(si, ci, &si->full_clusters, 591 + CLUSTER_FLAG_FULL); 592 + } 618 593 } 619 594 620 595 /* ··· 677 568 VM_BUG_ON(ci->flags); 678 569 } 679 570 680 - /* 681 - * The cluster ci decreases @nr_pages usage. If the usage counter becomes 0, 682 - * which means no page in the cluster is in use, we can optionally discard 683 - * the cluster and add it to free cluster list. 684 - */ 685 - static void dec_cluster_info_page(struct swap_info_struct *si, 686 - struct swap_cluster_info *ci, int nr_pages) 687 - { 688 - VM_BUG_ON(ci->count < nr_pages); 689 - VM_BUG_ON(cluster_is_free(ci)); 690 - lockdep_assert_held(&si->lock); 691 - lockdep_assert_held(&ci->lock); 692 - ci->count -= nr_pages; 693 - 694 - if (!ci->count) { 695 - free_cluster(si, ci); 696 - return; 697 - } 698 - 699 - if (ci->flags != CLUSTER_FLAG_NONFULL) 700 - move_cluster(si, ci, &si->nonfull_clusters[ci->order], 701 - CLUSTER_FLAG_NONFULL); 702 - } 703 - 704 571 static bool cluster_reclaim_range(struct swap_info_struct *si, 705 572 struct swap_cluster_info *ci, 706 573 unsigned long start, unsigned long end) ··· 686 601 int nr_reclaim; 687 602 688 603 spin_unlock(&ci->lock); 689 - spin_unlock(&si->lock); 690 - 691 604 do { 692 605 switch (READ_ONCE(map[offset])) { 693 606 case 0: ··· 703 620 } 704 621 } while (offset < end); 705 622 out: 706 - spin_lock(&si->lock); 707 623 spin_lock(&ci->lock); 708 - 709 624 /* 710 625 * Recheck the range no matter reclaim succeeded or not, the slot 711 626 * could have been be freed while we are not holding the lock. ··· 717 636 718 637 static bool cluster_scan_range(struct swap_info_struct *si, 719 638 struct swap_cluster_info *ci, 720 - unsigned long start, unsigned int nr_pages) 639 + unsigned long start, unsigned int nr_pages, 640 + bool *need_reclaim) 721 641 { 722 642 unsigned long offset, end = start + nr_pages; 723 643 unsigned char *map = si->swap_map; 724 - bool need_reclaim = false; 725 644 726 645 for (offset = start; offset < end; offset++) { 727 646 switch (READ_ONCE(map[offset])) { ··· 730 649 case SWAP_HAS_CACHE: 731 650 if (!vm_swap_full()) 732 651 return false; 733 - need_reclaim = true; 652 + *need_reclaim = true; 734 653 continue; 735 654 default: 736 655 return false; 737 656 } 738 657 } 739 - 740 - if (need_reclaim) 741 - return cluster_reclaim_range(si, ci, start, end); 742 658 743 659 return true; 744 660 } ··· 751 673 if (!(si->flags & SWP_WRITEOK)) 752 674 return false; 753 675 754 - VM_BUG_ON(ci->flags == CLUSTER_FLAG_NONE); 755 - VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE); 756 - 757 - if (cluster_is_free(ci)) { 758 - if (nr_pages < SWAPFILE_CLUSTER) 759 - move_cluster(si, ci, &si->nonfull_clusters[order], 760 - CLUSTER_FLAG_NONFULL); 676 + /* 677 + * The first allocation in a cluster makes the 678 + * cluster exclusive to this order 679 + */ 680 + if (cluster_is_empty(ci)) 761 681 ci->order = order; 762 - } 763 682 764 683 memset(si->swap_map + start, usage, nr_pages); 765 684 swap_range_alloc(si, nr_pages); 766 685 ci->count += nr_pages; 767 - 768 - if (ci->count == SWAPFILE_CLUSTER) 769 - move_cluster(si, ci, &si->full_clusters, CLUSTER_FLAG_FULL); 770 686 771 687 return true; 772 688 } ··· 772 700 unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1); 773 701 unsigned long end = min(start + SWAPFILE_CLUSTER, si->max); 774 702 unsigned int nr_pages = 1 << order; 703 + bool need_reclaim, ret; 775 704 struct swap_cluster_info *ci; 776 705 777 - if (end < nr_pages) 778 - return SWAP_NEXT_INVALID; 779 - end -= nr_pages; 706 + ci = &si->cluster_info[offset / SWAPFILE_CLUSTER]; 707 + lockdep_assert_held(&ci->lock); 780 708 781 - ci = lock_cluster(si, offset); 782 - if (ci->count + nr_pages > SWAPFILE_CLUSTER) { 709 + if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER) { 783 710 offset = SWAP_NEXT_INVALID; 784 - goto done; 711 + goto out; 785 712 } 786 713 787 - while (offset <= end) { 788 - if (cluster_scan_range(si, ci, offset, nr_pages)) { 789 - if (!cluster_alloc_range(si, ci, offset, usage, order)) { 714 + for (end -= nr_pages; offset <= end; offset += nr_pages) { 715 + need_reclaim = false; 716 + if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim)) 717 + continue; 718 + if (need_reclaim) { 719 + ret = cluster_reclaim_range(si, ci, start, end); 720 + /* 721 + * Reclaim drops ci->lock and cluster could be used 722 + * by another order. Not checking flag as off-list 723 + * cluster has no flag set, and change of list 724 + * won't cause fragmentation. 725 + */ 726 + if (!cluster_is_usable(ci, order)) { 790 727 offset = SWAP_NEXT_INVALID; 791 - goto done; 728 + goto out; 792 729 } 793 - *foundp = offset; 794 - if (ci->count == SWAPFILE_CLUSTER) { 795 - offset = SWAP_NEXT_INVALID; 796 - goto done; 797 - } 798 - offset += nr_pages; 799 - break; 730 + if (cluster_is_empty(ci)) 731 + offset = start; 732 + /* Reclaim failed but cluster is usable, try next */ 733 + if (!ret) 734 + continue; 735 + } 736 + if (!cluster_alloc_range(si, ci, offset, usage, order)) { 737 + offset = SWAP_NEXT_INVALID; 738 + goto out; 739 + } 740 + *foundp = offset; 741 + if (ci->count == SWAPFILE_CLUSTER) { 742 + offset = SWAP_NEXT_INVALID; 743 + goto out; 800 744 } 801 745 offset += nr_pages; 746 + break; 802 747 } 803 748 if (offset > end) 804 749 offset = SWAP_NEXT_INVALID; 805 - done: 750 + out: 751 + relocate_cluster(si, ci); 806 752 unlock_cluster(ci); 807 753 return offset; 808 754 } ··· 837 747 if (force) 838 748 to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; 839 749 840 - while (!list_empty(&si->full_clusters)) { 841 - ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list); 842 - list_move_tail(&ci->list, &si->full_clusters); 750 + while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { 843 751 offset = cluster_offset(si, ci); 844 752 end = min(si->max, offset + SWAPFILE_CLUSTER); 845 753 to_scan--; 846 754 847 - spin_unlock(&si->lock); 848 755 while (offset < end) { 849 756 if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { 757 + spin_unlock(&ci->lock); 850 758 nr_reclaim = __try_to_reclaim_swap(si, offset, 851 759 TTRS_ANYWAY | TTRS_DIRECT); 760 + spin_lock(&ci->lock); 852 761 if (nr_reclaim) { 853 762 offset += abs(nr_reclaim); 854 763 continue; ··· 855 766 } 856 767 offset++; 857 768 } 858 - spin_lock(&si->lock); 859 769 770 + unlock_cluster(ci); 860 771 if (to_scan <= 0) 861 772 break; 862 773 } ··· 868 779 869 780 si = container_of(work, struct swap_info_struct, reclaim_work); 870 781 871 - spin_lock(&si->lock); 872 782 swap_reclaim_full_clusters(si, true); 873 - spin_unlock(&si->lock); 874 783 } 875 784 876 785 /* ··· 879 792 static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, 880 793 unsigned char usage) 881 794 { 882 - struct percpu_cluster *cluster; 883 795 struct swap_cluster_info *ci; 884 796 unsigned int offset, found = 0; 885 797 886 - new_cluster: 887 - lockdep_assert_held(&si->lock); 888 - cluster = this_cpu_ptr(si->percpu_cluster); 889 - offset = cluster->next[order]; 798 + /* Fast path using per CPU cluster */ 799 + local_lock(&si->percpu_cluster->lock); 800 + offset = __this_cpu_read(si->percpu_cluster->next[order]); 890 801 if (offset) { 891 - offset = alloc_swap_scan_cluster(si, offset, &found, order, usage); 802 + ci = lock_cluster(si, offset); 803 + /* Cluster could have been used by another order */ 804 + if (cluster_is_usable(ci, order)) { 805 + if (cluster_is_empty(ci)) 806 + offset = cluster_offset(si, ci); 807 + offset = alloc_swap_scan_cluster(si, offset, &found, 808 + order, usage); 809 + } else { 810 + unlock_cluster(ci); 811 + } 892 812 if (found) 893 813 goto done; 894 814 } 895 815 896 - if (!list_empty(&si->free_clusters)) { 897 - ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list); 898 - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage); 899 - /* 900 - * Either we didn't touch the cluster due to swapoff, 901 - * or the allocation must success. 902 - */ 903 - VM_BUG_ON((si->flags & SWP_WRITEOK) && !found); 904 - goto done; 816 + new_cluster: 817 + ci = isolate_lock_cluster(si, &si->free_clusters); 818 + if (ci) { 819 + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 820 + &found, order, usage); 821 + if (found) 822 + goto done; 905 823 } 906 824 907 825 /* Try reclaim from full clusters if free clusters list is drained */ ··· 914 822 swap_reclaim_full_clusters(si, false); 915 823 916 824 if (order < PMD_ORDER) { 917 - unsigned int frags = 0; 825 + unsigned int frags = 0, frags_existing; 918 826 919 - while (!list_empty(&si->nonfull_clusters[order])) { 920 - ci = list_first_entry(&si->nonfull_clusters[order], 921 - struct swap_cluster_info, list); 922 - move_cluster(si, ci, &si->frag_clusters[order], CLUSTER_FLAG_FRAG); 827 + while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { 923 828 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 924 829 &found, order, usage); 925 - frags++; 926 830 if (found) 927 831 goto done; 832 + /* Clusters failed to allocate are moved to frag_clusters */ 833 + frags++; 928 834 } 929 835 930 - /* 931 - * Nonfull clusters are moved to frag tail if we reached 932 - * here, count them too, don't over scan the frag list. 933 - */ 934 - while (frags < si->frag_cluster_nr[order]) { 935 - ci = list_first_entry(&si->frag_clusters[order], 936 - struct swap_cluster_info, list); 836 + frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); 837 + while (frags < frags_existing && 838 + (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { 839 + atomic_long_dec(&si->frag_cluster_nr[order]); 937 840 /* 938 - * Rotate the frag list to iterate, they were all failing 939 - * high order allocation or moved here due to per-CPU usage, 940 - * this help keeping usable cluster ahead. 841 + * Rotate the frag list to iterate, they were all 842 + * failing high order allocation or moved here due to 843 + * per-CPU usage, but they could contain newly released 844 + * reclaimable (eg. lazy-freed swap cache) slots. 941 845 */ 942 - list_move_tail(&ci->list, &si->frag_clusters[order]); 943 846 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 944 847 &found, order, usage); 945 - frags++; 946 848 if (found) 947 849 goto done; 850 + frags++; 948 851 } 949 852 } 950 853 951 - if (!list_empty(&si->discard_clusters)) { 952 - /* 953 - * we don't have free cluster but have some clusters in 954 - * discarding, do discard now and reclaim them, then 955 - * reread cluster_next_cpu since we dropped si->lock 956 - */ 957 - swap_do_scheduled_discard(si); 854 + /* 855 + * We don't have free cluster but have some clusters in 856 + * discarding, do discard now and reclaim them, then 857 + * reread cluster_next_cpu since we dropped si->lock 858 + */ 859 + if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si)) 958 860 goto new_cluster; 959 - } 960 861 961 862 if (order) 962 863 goto done; ··· 960 875 * Clusters here have at least one usable slots and can't fail order 0 961 876 * allocation, but reclaim may drop si->lock and race with another user. 962 877 */ 963 - while (!list_empty(&si->frag_clusters[o])) { 964 - ci = list_first_entry(&si->frag_clusters[o], 965 - struct swap_cluster_info, list); 878 + while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { 879 + atomic_long_dec(&si->frag_cluster_nr[o]); 966 880 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 967 - &found, 0, usage); 881 + &found, order, usage); 968 882 if (found) 969 883 goto done; 970 884 } 971 885 972 - while (!list_empty(&si->nonfull_clusters[o])) { 973 - ci = list_first_entry(&si->nonfull_clusters[o], 974 - struct swap_cluster_info, list); 886 + while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { 975 887 offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), 976 - &found, 0, usage); 888 + &found, order, usage); 977 889 if (found) 978 890 goto done; 979 891 } 980 892 } 981 893 done: 982 - cluster->next[order] = offset; 894 + __this_cpu_write(si->percpu_cluster->next[order], offset); 895 + local_unlock(&si->percpu_cluster->lock); 896 + 983 897 return found; 984 898 } 985 899 ··· 1242 1158 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); 1243 1159 spin_unlock(&swap_avail_lock); 1244 1160 if (get_swap_device_info(si)) { 1245 - spin_lock(&si->lock); 1246 1161 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, 1247 1162 n_goal, swp_entries, order); 1248 - spin_unlock(&si->lock); 1249 1163 put_swap_device(si); 1250 1164 if (n_ret || size > 1) 1251 1165 goto check_out; 1252 - cond_resched(); 1253 1166 } 1254 1167 1255 1168 spin_lock(&swap_avail_lock); ··· 1459 1378 if (!has_cache) { 1460 1379 for (i = 0; i < nr; i++) 1461 1380 zswap_invalidate(swp_entry(si->type, offset + i)); 1462 - spin_lock(&si->lock); 1463 1381 swap_entry_range_free(si, entry, nr); 1464 - spin_unlock(&si->lock); 1465 1382 } 1466 1383 return has_cache; 1467 1384 ··· 1488 1409 unsigned char *map_end = map + nr_pages; 1489 1410 struct swap_cluster_info *ci; 1490 1411 1412 + /* It should never free entries across different clusters */ 1413 + VM_BUG_ON((offset / SWAPFILE_CLUSTER) != ((offset + nr_pages - 1) / SWAPFILE_CLUSTER)); 1414 + 1491 1415 ci = lock_cluster(si, offset); 1416 + VM_BUG_ON(cluster_is_empty(ci)); 1417 + VM_BUG_ON(ci->count < nr_pages); 1418 + 1419 + ci->count -= nr_pages; 1492 1420 do { 1493 1421 VM_BUG_ON(*map != SWAP_HAS_CACHE); 1494 1422 *map = 0; 1495 1423 } while (++map < map_end); 1496 - dec_cluster_info_page(si, ci, nr_pages); 1497 - unlock_cluster(ci); 1498 1424 1499 1425 mem_cgroup_uncharge_swap(entry, nr_pages); 1500 1426 swap_range_free(si, offset, nr_pages); 1427 + 1428 + if (!ci->count) 1429 + free_cluster(si, ci); 1430 + else 1431 + partial_free_cluster(si, ci); 1432 + unlock_cluster(ci); 1501 1433 } 1502 1434 1503 1435 static void cluster_swap_free_nr(struct swap_info_struct *si, ··· 1580 1490 ci = lock_cluster(si, offset); 1581 1491 if (size > 1 && swap_is_has_cache(si, offset, size)) { 1582 1492 unlock_cluster(ci); 1583 - spin_lock(&si->lock); 1584 1493 swap_entry_range_free(si, entry, size); 1585 - spin_unlock(&si->lock); 1586 1494 return; 1587 1495 } 1588 1496 for (int i = 0; i < size; i++, entry.val++) { ··· 1595 1507 unlock_cluster(ci); 1596 1508 } 1597 1509 1598 - static int swp_entry_cmp(const void *ent1, const void *ent2) 1599 - { 1600 - const swp_entry_t *e1 = ent1, *e2 = ent2; 1601 - 1602 - return (int)swp_type(*e1) - (int)swp_type(*e2); 1603 - } 1604 - 1605 1510 void swapcache_free_entries(swp_entry_t *entries, int n) 1606 1511 { 1607 - struct swap_info_struct *si, *prev; 1608 1512 int i; 1513 + struct swap_info_struct *si = NULL; 1609 1514 1610 1515 if (n <= 0) 1611 1516 return; 1612 1517 1613 - prev = NULL; 1614 - si = NULL; 1615 - 1616 - /* 1617 - * Sort swap entries by swap device, so each lock is only taken once. 1618 - * nr_swapfiles isn't absolutely correct, but the overhead of sort() is 1619 - * so low that it isn't necessary to optimize further. 1620 - */ 1621 - if (nr_swapfiles > 1) 1622 - sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); 1623 1518 for (i = 0; i < n; ++i) { 1624 1519 si = _swap_info_get(entries[i]); 1625 - 1626 - if (si != prev) { 1627 - if (prev != NULL) 1628 - spin_unlock(&prev->lock); 1629 - if (si != NULL) 1630 - spin_lock(&si->lock); 1631 - } 1632 1520 if (si) 1633 1521 swap_entry_range_free(si, entries[i], 1); 1634 - prev = si; 1635 1522 } 1636 - if (si) 1637 - spin_unlock(&si->lock); 1638 1523 } 1639 1524 1640 1525 int __swap_count(swp_entry_t entry) ··· 1860 1799 1861 1800 /* This is called for allocating swap entry, not cache */ 1862 1801 if (get_swap_device_info(si)) { 1863 - spin_lock(&si->lock); 1864 1802 if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0)) 1865 1803 atomic_long_dec(&nr_swap_pages); 1866 - spin_unlock(&si->lock); 1867 1804 put_swap_device(si); 1868 1805 } 1869 1806 fail: ··· 3201 3142 cluster = per_cpu_ptr(si->percpu_cluster, cpu); 3202 3143 for (i = 0; i < SWAP_NR_ORDERS; i++) 3203 3144 cluster->next[i] = SWAP_NEXT_INVALID; 3145 + local_lock_init(&cluster->lock); 3204 3146 } 3205 3147 3206 3148 /* ··· 3225 3165 for (i = 0; i < SWAP_NR_ORDERS; i++) { 3226 3166 INIT_LIST_HEAD(&si->nonfull_clusters[i]); 3227 3167 INIT_LIST_HEAD(&si->frag_clusters[i]); 3228 - si->frag_cluster_nr[i] = 0; 3168 + atomic_long_set(&si->frag_cluster_nr[i], 0); 3229 3169 } 3230 3170 3231 3171 /* ··· 3707 3647 */ 3708 3648 goto outer; 3709 3649 } 3710 - spin_lock(&si->lock); 3711 3650 3712 3651 offset = swp_offset(entry); 3713 3652 ··· 3771 3712 spin_unlock(&si->cont_lock); 3772 3713 out: 3773 3714 unlock_cluster(ci); 3774 - spin_unlock(&si->lock); 3775 3715 put_swap_device(si); 3776 3716 outer: 3777 3717 if (page)

Configure Feed

Configure Feed