mm, swap: use the swap table to track the swap count

+3 -25

include/linux/swap.h

··· 208 208 SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ 209 209 SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ 210 210 SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ 211 - SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ 212 211 SWP_BLKDEV = (1 << 6), /* its a block device */ 213 212 SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ 214 213 SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ ··· 221 222 #define SWAP_CLUSTER_MAX 32UL 222 223 #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) 223 224 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX 224 - 225 - /* Bit flag in swap_map */ 226 - #define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ 227 - 228 - /* Special value in first swap_map */ 229 - #define SWAP_MAP_MAX 0x3e /* Max count */ 230 - #define SWAP_MAP_BAD 0x3f /* Note page is bad */ 231 - 232 - /* Special value in each swap_map continuation */ 233 - #define SWAP_CONT_MAX 0x7f /* Max count */ 234 225 235 226 /* 236 227 * The first page in the swap file is the swap header, which is always marked ··· 253 264 signed short prio; /* swap priority of this type */ 254 265 struct plist_node list; /* entry in swap_active_head */ 255 266 signed char type; /* strange name for an index */ 256 - unsigned int max; /* extent of the swap_map */ 257 - unsigned char *swap_map; /* vmalloc'ed array of usage counts */ 267 + unsigned int max; /* size of this swap device */ 258 268 unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ 259 269 struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ 260 270 struct list_head free_clusters; /* free clusters list */ ··· 272 284 struct completion comp; /* seldom referenced */ 273 285 spinlock_t lock; /* 274 286 * protect map scan related fields like 275 - * swap_map, inuse_pages and all cluster 276 - * lists. other fields are only changed 287 + * inuse_pages and all cluster lists. 288 + * Other fields are only changed 277 289 * at swapon/swapoff, so are protected 278 290 * by swap_lock. changing flags need 279 291 * hold this lock and swap_lock. If 280 292 * both locks need hold, hold swap_lock 281 293 * first. 282 - */ 283 - spinlock_t cont_lock; /* 284 - * protect swap count continuation page 285 - * list. 286 294 */ 287 295 struct work_struct discard_work; /* discard worker */ 288 296 struct work_struct reclaim_work; /* reclaim worker */ ··· 435 451 } 436 452 437 453 extern void si_swapinfo(struct sysinfo *); 438 - extern int add_swap_count_continuation(swp_entry_t, gfp_t); 439 454 int swap_type_of(dev_t device, sector_t offset); 440 455 int find_first_swap(dev_t *device); 441 456 extern unsigned int count_swap_pages(int, int); ··· 498 515 499 516 static inline void free_swap_cache(struct folio *folio) 500 517 { 501 - } 502 - 503 - static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) 504 - { 505 - return 0; 506 518 } 507 519 508 520 static inline int swap_dup_entry_direct(swp_entry_t ent)

+1 -1

mm/memory.c

··· 1346 1346 1347 1347 if (ret == -EIO) { 1348 1348 VM_WARN_ON_ONCE(!entry.val); 1349 - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { 1349 + if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) { 1350 1350 ret = -ENOMEM; 1351 1351 goto out; 1352 1352 }

+11 -3

mm/swap.h

··· 37 37 u8 flags; 38 38 u8 order; 39 39 atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ 40 + unsigned int *extend_table; /* For large swap count, protected by ci->lock */ 40 41 struct list_head list; 41 42 }; 42 43 ··· 184 183 spin_unlock_irq(&ci->lock); 185 184 } 186 185 186 + extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); 187 + 187 188 /* 188 189 * Below are the core routines for doing swap for a folio. 189 190 * All helpers requires the folio to be locked, and a locked folio ··· 209 206 void folio_put_swap(struct folio *folio, struct page *subpage); 210 207 211 208 /* For internal use */ 212 - extern void swap_entries_free(struct swap_info_struct *si, 213 - struct swap_cluster_info *ci, 214 - unsigned long offset, unsigned int nr_pages); 209 + extern void __swap_cluster_free_entries(struct swap_info_struct *si, 210 + struct swap_cluster_info *ci, 211 + unsigned int ci_off, unsigned int nr_pages); 215 212 216 213 /* linux/mm/page_io.c */ 217 214 int sio_pool_init(void); ··· 447 444 struct swap_iocb **swap_plug) 448 445 { 449 446 return 0; 447 + } 448 + 449 + static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) 450 + { 451 + return -EINVAL; 450 452 } 451 453 452 454 static inline bool swap_cache_has_folio(swp_entry_t entry)

+26 -27

mm/swap_state.c

··· 140 140 void __swap_cache_add_folio(struct swap_cluster_info *ci, 141 141 struct folio *folio, swp_entry_t entry) 142 142 { 143 - unsigned long new_tb; 144 - unsigned int ci_start, ci_off, ci_end; 143 + unsigned int ci_off = swp_cluster_offset(entry), ci_end; 145 144 unsigned long nr_pages = folio_nr_pages(folio); 145 + unsigned long pfn = folio_pfn(folio); 146 + unsigned long old_tb; 146 147 147 148 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 148 149 VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); 149 150 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); 150 151 151 - new_tb = folio_to_swp_tb(folio, 0); 152 - ci_start = swp_cluster_offset(entry); 153 - ci_off = ci_start; 154 - ci_end = ci_start + nr_pages; 152 + ci_end = ci_off + nr_pages; 155 153 do { 156 - VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); 157 - __swap_table_set(ci, ci_off, new_tb); 154 + old_tb = __swap_table_get(ci, ci_off); 155 + VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); 156 + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); 158 157 } while (++ci_off < ci_end); 159 158 160 159 folio_ref_add(folio, nr_pages); ··· 182 183 unsigned long old_tb; 183 184 struct swap_info_struct *si; 184 185 struct swap_cluster_info *ci; 185 - unsigned int ci_start, ci_off, ci_end, offset; 186 + unsigned int ci_start, ci_off, ci_end; 186 187 unsigned long nr_pages = folio_nr_pages(folio); 187 188 188 189 si = __swap_entry_to_info(entry); 189 190 ci_start = swp_cluster_offset(entry); 190 191 ci_end = ci_start + nr_pages; 191 192 ci_off = ci_start; 192 - offset = swp_offset(entry); 193 193 ci = swap_cluster_lock(si, swp_offset(entry)); 194 194 if (unlikely(!ci->table)) { 195 195 err = -ENOENT; ··· 200 202 err = -EEXIST; 201 203 goto failed; 202 204 } 203 - if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { 205 + if (unlikely(!__swp_tb_get_count(old_tb))) { 204 206 err = -ENOENT; 205 207 goto failed; 206 208 } 207 209 if (swp_tb_is_shadow(old_tb)) 208 210 shadow = swp_tb_to_shadow(old_tb); 209 - offset++; 210 211 } while (++ci_off < ci_end); 211 212 __swap_cache_add_folio(ci, folio, entry); 212 213 swap_cluster_unlock(ci); ··· 234 237 void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, 235 238 swp_entry_t entry, void *shadow) 236 239 { 240 + int count; 241 + unsigned long old_tb; 237 242 struct swap_info_struct *si; 238 - unsigned long old_tb, new_tb; 239 243 unsigned int ci_start, ci_off, ci_end; 240 244 bool folio_swapped = false, need_free = false; 241 245 unsigned long nr_pages = folio_nr_pages(folio); ··· 247 249 VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); 248 250 249 251 si = __swap_entry_to_info(entry); 250 - new_tb = shadow_to_swp_tb(shadow, 0); 251 252 ci_start = swp_cluster_offset(entry); 252 253 ci_end = ci_start + nr_pages; 253 254 ci_off = ci_start; 254 255 do { 255 - /* If shadow is NULL, we sets an empty shadow */ 256 - old_tb = __swap_table_xchg(ci, ci_off, new_tb); 256 + old_tb = __swap_table_get(ci, ci_off); 257 257 WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || 258 258 swp_tb_to_folio(old_tb) != folio); 259 - if (__swap_count(swp_entry(si->type, 260 - swp_offset(entry) + ci_off - ci_start))) 259 + count = __swp_tb_get_count(old_tb); 260 + if (count) 261 261 folio_swapped = true; 262 262 else 263 263 need_free = true; 264 + /* If shadow is NULL, we sets an empty shadow. */ 265 + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); 264 266 } while (++ci_off < ci_end); 265 267 266 268 folio->swap.val = 0; ··· 269 271 lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); 270 272 271 273 if (!folio_swapped) { 272 - swap_entries_free(si, ci, swp_offset(entry), nr_pages); 274 + __swap_cluster_free_entries(si, ci, ci_start, nr_pages); 273 275 } else if (need_free) { 276 + ci_off = ci_start; 274 277 do { 275 - if (!__swap_count(entry)) 276 - swap_entries_free(si, ci, swp_offset(entry), 1); 277 - entry.val++; 278 - } while (--nr_pages); 278 + if (!__swp_tb_get_count(__swap_table_get(ci, ci_off))) 279 + __swap_cluster_free_entries(si, ci, ci_off, 1); 280 + } while (++ci_off < ci_end); 279 281 } 280 282 } 281 283 ··· 322 324 unsigned long nr_pages = folio_nr_pages(new); 323 325 unsigned int ci_off = swp_cluster_offset(entry); 324 326 unsigned int ci_end = ci_off + nr_pages; 325 - unsigned long old_tb, new_tb; 327 + unsigned long pfn = folio_pfn(new); 328 + unsigned long old_tb; 326 329 327 330 VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); 328 331 VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); 329 332 VM_WARN_ON_ONCE(!entry.val); 330 333 331 334 /* Swap cache still stores N entries instead of a high-order entry */ 332 - new_tb = folio_to_swp_tb(new, 0); 333 335 do { 334 - old_tb = __swap_table_xchg(ci, ci_off, new_tb); 336 + old_tb = __swap_table_get(ci, ci_off); 335 337 WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); 338 + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); 336 339 } while (++ci_off < ci_end); 337 340 338 341 /* ··· 367 368 ci_end = ci_off + nr_ents; 368 369 do { 369 370 old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); 370 - WARN_ON_ONCE(swp_tb_is_folio(old)); 371 + WARN_ON_ONCE(swp_tb_is_folio(old) || swp_tb_get_count(old)); 371 372 } while (++ci_off < ci_end); 372 373 } 373 374

+5

mm/swap_table.h

··· 191 191 return -EINVAL; 192 192 } 193 193 194 + static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count) 195 + { 196 + return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count)); 197 + } 198 + 194 199 /* 195 200 * Helpers for accessing or modifying the swap table of a cluster, 196 201 * the swap cluster must be locked.

+290 -504

mm/swapfile.c

··· 51 51 #include "swap_table.h" 52 52 #include "swap.h" 53 53 54 - static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 55 - unsigned char); 56 - static void free_swap_count_continuations(struct swap_info_struct *); 57 54 static void swap_range_alloc(struct swap_info_struct *si, 58 55 unsigned int nr_entries); 59 - static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); 60 - static void swap_put_entry_locked(struct swap_info_struct *si, 61 - struct swap_cluster_info *ci, 62 - unsigned long offset); 63 56 static bool folio_swapcache_freeable(struct folio *folio); 64 57 static void move_cluster(struct swap_info_struct *si, 65 58 struct swap_cluster_info *ci, struct list_head *list, ··· 175 182 /* Reclaim the swap entry if swap is getting full */ 176 183 #define TTRS_FULL 0x4 177 184 178 - static bool swap_only_has_cache(struct swap_info_struct *si, 179 - struct swap_cluster_info *ci, 185 + static bool swap_only_has_cache(struct swap_cluster_info *ci, 180 186 unsigned long offset, int nr_pages) 181 187 { 182 188 unsigned int ci_off = offset % SWAPFILE_CLUSTER; 183 - unsigned char *map = si->swap_map + offset; 184 - unsigned char *map_end = map + nr_pages; 189 + unsigned int ci_end = ci_off + nr_pages; 185 190 unsigned long swp_tb; 186 191 187 192 do { 188 193 swp_tb = __swap_table_get(ci, ci_off); 189 194 VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); 190 - if (*map) 195 + if (swp_tb_get_count(swp_tb)) 191 196 return false; 192 - ++ci_off; 193 - } while (++map < map_end); 197 + } while (++ci_off < ci_end); 194 198 195 199 return true; 196 200 } ··· 246 256 * reference or pending writeback, and can't be allocated to others. 247 257 */ 248 258 ci = swap_cluster_lock(si, offset); 249 - need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); 259 + need_reclaim = swap_only_has_cache(ci, offset, nr_pages); 250 260 swap_cluster_unlock(ci); 251 261 if (!need_reclaim) 252 262 goto out_unlock; ··· 469 479 } while (++ci_off < ci_end); 470 480 471 481 WARN_ON_ONCE(bad_slots != (swapoff ? ci->count : 0)); 482 + WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); 472 483 } 473 484 474 485 static void swap_cluster_free_table(struct swap_cluster_info *ci) ··· 798 807 pr_warn("Duplicated bad slot offset %d\n", offset); 799 808 ret = -EINVAL; 800 809 } else { 801 - si->swap_map[offset] = SWAP_MAP_BAD; 802 810 ci->count++; 803 811 } 804 812 spin_unlock(&ci->lock); ··· 819 829 { 820 830 unsigned int nr_pages = 1 << order; 821 831 unsigned long offset = start, end = start + nr_pages; 822 - unsigned char *map = si->swap_map; 823 832 unsigned long swp_tb; 824 833 825 834 spin_unlock(&ci->lock); 826 835 do { 827 - if (READ_ONCE(map[offset])) 828 - break; 829 836 swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); 830 - if (swp_tb_is_folio(swp_tb)) { 837 + if (swp_tb_get_count(swp_tb)) 838 + break; 839 + if (swp_tb_is_folio(swp_tb)) 831 840 if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) 832 841 break; 833 - } 834 842 } while (++offset < end); 835 843 spin_lock(&ci->lock); 836 844 ··· 852 864 */ 853 865 for (offset = start; offset < end; offset++) { 854 866 swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); 855 - if (map[offset] || !swp_tb_is_null(swp_tb)) 867 + if (!swp_tb_is_null(swp_tb)) 856 868 return false; 857 869 } 858 870 ··· 864 876 unsigned long offset, unsigned int nr_pages, 865 877 bool *need_reclaim) 866 878 { 867 - unsigned long end = offset + nr_pages; 868 - unsigned char *map = si->swap_map; 879 + unsigned int ci_off = offset % SWAPFILE_CLUSTER; 880 + unsigned int ci_end = ci_off + nr_pages; 869 881 unsigned long swp_tb; 870 882 871 - if (cluster_is_empty(ci)) 872 - return true; 873 - 874 883 do { 875 - if (map[offset]) 876 - return false; 877 - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); 878 - if (swp_tb_is_folio(swp_tb)) { 884 + swp_tb = __swap_table_get(ci, ci_off); 885 + if (swp_tb_is_null(swp_tb)) 886 + continue; 887 + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { 879 888 if (!vm_swap_full()) 880 889 return false; 881 890 *need_reclaim = true; 882 - } else { 883 - /* A entry with no count and no cache must be null */ 884 - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); 891 + continue; 885 892 } 886 - } while (++offset < end); 893 + /* Slot with zero count can only be NULL or folio */ 894 + VM_WARN_ON(!swp_tb_get_count(swp_tb)); 895 + return false; 896 + } while (++ci_off < ci_end); 887 897 888 898 return true; 889 899 } 890 900 891 - static bool cluster_alloc_range(struct swap_info_struct *si, 892 - struct swap_cluster_info *ci, 893 - struct folio *folio, 894 - unsigned int offset) 901 + static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, 902 + struct swap_cluster_info *ci, 903 + struct folio *folio, 904 + unsigned int ci_off) 895 905 { 896 - unsigned long nr_pages; 897 906 unsigned int order; 907 + unsigned long nr_pages; 898 908 899 909 lockdep_assert_held(&ci->lock); 900 910 ··· 911 925 if (likely(folio)) { 912 926 order = folio_order(folio); 913 927 nr_pages = 1 << order; 914 - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); 915 - __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); 928 + swap_cluster_assert_empty(ci, ci_off, nr_pages, false); 929 + __swap_cache_add_folio(ci, folio, swp_entry(si->type, 930 + ci_off + cluster_offset(si, ci))); 916 931 } else if (IS_ENABLED(CONFIG_HIBERNATION)) { 917 932 order = 0; 918 933 nr_pages = 1; 919 - WARN_ON_ONCE(si->swap_map[offset]); 920 - si->swap_map[offset] = 1; 921 - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false); 934 + swap_cluster_assert_empty(ci, ci_off, 1, false); 935 + /* Sets a fake shadow as placeholder */ 936 + __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); 922 937 } else { 923 938 /* Allocation without folio is only possible with hibernation */ 924 939 WARN_ON_ONCE(1); ··· 970 983 if (!ret) 971 984 continue; 972 985 } 973 - if (!cluster_alloc_range(si, ci, folio, offset)) 986 + if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER)) 974 987 break; 975 988 found = offset; 976 989 offset += nr_pages; ··· 1017 1030 long to_scan = 1; 1018 1031 unsigned long offset, end; 1019 1032 struct swap_cluster_info *ci; 1020 - unsigned char *map = si->swap_map; 1033 + unsigned long swp_tb; 1021 1034 int nr_reclaim; 1022 1035 1023 1036 if (force) ··· 1029 1042 to_scan--; 1030 1043 1031 1044 while (offset < end) { 1032 - if (!READ_ONCE(map[offset]) && 1033 - swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { 1045 + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); 1046 + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { 1034 1047 spin_unlock(&ci->lock); 1035 1048 nr_reclaim = __try_to_reclaim_swap(si, offset, 1036 1049 TTRS_ANYWAY); ··· 1439 1452 return false; 1440 1453 } 1441 1454 1455 + static int swap_extend_table_alloc(struct swap_info_struct *si, 1456 + struct swap_cluster_info *ci, gfp_t gfp) 1457 + { 1458 + void *table; 1459 + 1460 + table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); 1461 + if (!table) 1462 + return -ENOMEM; 1463 + 1464 + spin_lock(&ci->lock); 1465 + if (!ci->extend_table) 1466 + ci->extend_table = table; 1467 + else 1468 + kfree(table); 1469 + spin_unlock(&ci->lock); 1470 + return 0; 1471 + } 1472 + 1473 + int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) 1474 + { 1475 + int ret; 1476 + struct swap_info_struct *si; 1477 + struct swap_cluster_info *ci; 1478 + unsigned long offset = swp_offset(entry); 1479 + 1480 + si = get_swap_device(entry); 1481 + if (!si) 1482 + return 0; 1483 + 1484 + ci = __swap_offset_to_cluster(si, offset); 1485 + ret = swap_extend_table_alloc(si, ci, gfp); 1486 + 1487 + put_swap_device(si); 1488 + return ret; 1489 + } 1490 + 1491 + static void swap_extend_table_try_free(struct swap_cluster_info *ci) 1492 + { 1493 + unsigned long i; 1494 + bool can_free = true; 1495 + 1496 + if (!ci->extend_table) 1497 + return; 1498 + 1499 + for (i = 0; i < SWAPFILE_CLUSTER; i++) { 1500 + if (ci->extend_table[i]) 1501 + can_free = false; 1502 + } 1503 + 1504 + if (can_free) { 1505 + kfree(ci->extend_table); 1506 + ci->extend_table = NULL; 1507 + } 1508 + } 1509 + 1510 + /* Decrease the swap count of one slot, without freeing it */ 1511 + static void __swap_cluster_put_entry(struct swap_cluster_info *ci, 1512 + unsigned int ci_off) 1513 + { 1514 + int count; 1515 + unsigned long swp_tb; 1516 + 1517 + lockdep_assert_held(&ci->lock); 1518 + swp_tb = __swap_table_get(ci, ci_off); 1519 + count = __swp_tb_get_count(swp_tb); 1520 + 1521 + VM_WARN_ON_ONCE(count <= 0); 1522 + VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX); 1523 + 1524 + if (count == SWP_TB_COUNT_MAX) { 1525 + count = ci->extend_table[ci_off]; 1526 + /* Overflow starts with SWP_TB_COUNT_MAX */ 1527 + VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX); 1528 + count--; 1529 + if (count == (SWP_TB_COUNT_MAX - 1)) { 1530 + ci->extend_table[ci_off] = 0; 1531 + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); 1532 + swap_extend_table_try_free(ci); 1533 + } else { 1534 + ci->extend_table[ci_off] = count; 1535 + } 1536 + } else { 1537 + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); 1538 + } 1539 + } 1540 + 1442 1541 /** 1443 - * swap_put_entries_cluster - Decrease the swap count of a set of slots. 1542 + * swap_put_entries_cluster - Decrease the swap count of slots within one cluster 1444 1543 * @si: The swap device. 1445 - * @start: start offset of slots. 1544 + * @offset: start offset of slots. 1446 1545 * @nr: number of slots. 1447 - * @reclaim_cache: if true, also reclaim the swap cache. 1546 + * @reclaim_cache: if true, also reclaim the swap cache if slots are freed. 1448 1547 * 1449 1548 * This helper decreases the swap count of a set of slots and tries to 1450 1549 * batch free them. Also reclaims the swap cache if @reclaim_cache is true. 1451 - * Context: The caller must ensure that all slots belong to the same 1452 - * cluster and their swap count doesn't go underflow. 1550 + * 1551 + * Context: The specified slots must be pinned by existing swap count or swap 1552 + * cache reference, so they won't be released until this helper returns. 1453 1553 */ 1454 1554 static void swap_put_entries_cluster(struct swap_info_struct *si, 1455 - unsigned long start, int nr, 1555 + pgoff_t offset, int nr, 1456 1556 bool reclaim_cache) 1457 1557 { 1458 - unsigned long offset = start, end = start + nr; 1459 - unsigned long batch_start = SWAP_ENTRY_INVALID; 1460 1558 struct swap_cluster_info *ci; 1559 + unsigned int ci_off, ci_end; 1560 + pgoff_t end = offset + nr; 1461 1561 bool need_reclaim = false; 1462 1562 unsigned int nr_reclaimed; 1463 1563 unsigned long swp_tb; 1464 - unsigned int count; 1564 + int ci_batch = -1; 1465 1565 1466 1566 ci = swap_cluster_lock(si, offset); 1567 + ci_off = offset % SWAPFILE_CLUSTER; 1568 + ci_end = ci_off + nr; 1467 1569 do { 1468 - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); 1469 - count = si->swap_map[offset]; 1470 - VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); 1471 - if (count == 1) { 1570 + swp_tb = __swap_table_get(ci, ci_off); 1571 + if (swp_tb_get_count(swp_tb) == 1) { 1472 1572 /* count == 1 and non-cached slots will be batch freed. */ 1473 1573 if (!swp_tb_is_folio(swp_tb)) { 1474 - if (!batch_start) 1475 - batch_start = offset; 1574 + if (ci_batch == -1) 1575 + ci_batch = ci_off; 1476 1576 continue; 1477 1577 } 1478 1578 /* count will be 0 after put, slot can be reclaimed */ ··· 1571 1497 * slots will be freed when folio is removed from swap cache 1572 1498 * (__swap_cache_del_folio). 1573 1499 */ 1574 - swap_put_entry_locked(si, ci, offset); 1575 - if (batch_start) { 1576 - swap_entries_free(si, ci, batch_start, offset - batch_start); 1577 - batch_start = SWAP_ENTRY_INVALID; 1500 + __swap_cluster_put_entry(ci, ci_off); 1501 + if (ci_batch != -1) { 1502 + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); 1503 + ci_batch = -1; 1578 1504 } 1579 - } while (++offset < end); 1505 + } while (++ci_off < ci_end); 1580 1506 1581 - if (batch_start) 1582 - swap_entries_free(si, ci, batch_start, offset - batch_start); 1507 + if (ci_batch != -1) 1508 + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); 1583 1509 swap_cluster_unlock(ci); 1584 1510 1585 1511 if (!need_reclaim || !reclaim_cache) 1586 1512 return; 1587 1513 1588 - offset = start; 1589 1514 do { 1590 1515 nr_reclaimed = __try_to_reclaim_swap(si, offset, 1591 1516 TTRS_UNMAPPED | TTRS_FULL); ··· 1592 1519 if (nr_reclaimed) 1593 1520 offset = round_up(offset, abs(nr_reclaimed)); 1594 1521 } while (offset < end); 1522 + } 1523 + 1524 + /* Increase the swap count of one slot. */ 1525 + static int __swap_cluster_dup_entry(struct swap_cluster_info *ci, 1526 + unsigned int ci_off) 1527 + { 1528 + int count; 1529 + unsigned long swp_tb; 1530 + 1531 + lockdep_assert_held(&ci->lock); 1532 + swp_tb = __swap_table_get(ci, ci_off); 1533 + /* Bad or special slots can't be handled */ 1534 + if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb))) 1535 + return -EINVAL; 1536 + count = __swp_tb_get_count(swp_tb); 1537 + /* Must be either cached or have a count already */ 1538 + if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb))) 1539 + return -ENOENT; 1540 + 1541 + if (likely(count < (SWP_TB_COUNT_MAX - 1))) { 1542 + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1)); 1543 + VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]); 1544 + } else if (count == (SWP_TB_COUNT_MAX - 1)) { 1545 + if (ci->extend_table) { 1546 + VM_WARN_ON_ONCE(ci->extend_table[ci_off]); 1547 + ci->extend_table[ci_off] = SWP_TB_COUNT_MAX; 1548 + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX)); 1549 + } else { 1550 + return -ENOMEM; 1551 + } 1552 + } else if (count == SWP_TB_COUNT_MAX) { 1553 + VM_WARN_ON_ONCE(ci->extend_table[ci_off] >= 1554 + type_max(typeof(ci->extend_table[0]))); 1555 + ++ci->extend_table[ci_off]; 1556 + } else { 1557 + /* Never happens unless counting went wrong */ 1558 + WARN_ON_ONCE(1); 1559 + } 1560 + 1561 + return 0; 1562 + } 1563 + 1564 + /** 1565 + * swap_dup_entries_cluster: Increase the swap count of slots within one cluster. 1566 + * @si: The swap device. 1567 + * @offset: start offset of slots. 1568 + * @nr: number of slots. 1569 + * 1570 + * Context: The specified slots must be pinned by existing swap count or swap 1571 + * cache reference, so they won't be released until this helper returns. 1572 + * Return: 0 on success. -ENOMEM if the swap count maxed out (SWP_TB_COUNT_MAX) 1573 + * and failed to allocate an extended table, -EINVAL if any entry is bad entry. 1574 + */ 1575 + static int swap_dup_entries_cluster(struct swap_info_struct *si, 1576 + pgoff_t offset, int nr) 1577 + { 1578 + int err; 1579 + struct swap_cluster_info *ci; 1580 + unsigned int ci_start, ci_off, ci_end; 1581 + 1582 + ci_start = offset % SWAPFILE_CLUSTER; 1583 + ci_end = ci_start + nr; 1584 + ci_off = ci_start; 1585 + ci = swap_cluster_lock(si, offset); 1586 + restart: 1587 + do { 1588 + err = __swap_cluster_dup_entry(ci, ci_off); 1589 + if (unlikely(err)) { 1590 + if (err == -ENOMEM) { 1591 + spin_unlock(&ci->lock); 1592 + err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); 1593 + spin_lock(&ci->lock); 1594 + if (!err) 1595 + goto restart; 1596 + } 1597 + goto failed; 1598 + } 1599 + } while (++ci_off < ci_end); 1600 + swap_cluster_unlock(ci); 1601 + return 0; 1602 + failed: 1603 + while (ci_off-- > ci_start) 1604 + __swap_cluster_put_entry(ci, ci_off); 1605 + swap_extend_table_try_free(ci); 1606 + swap_cluster_unlock(ci); 1607 + return err; 1595 1608 } 1596 1609 1597 1610 /** ··· 1748 1589 * Context: Caller must ensure the folio is locked and in the swap cache. 1749 1590 * NOTE: The caller also has to ensure there is no raced call to 1750 1591 * swap_put_entries_direct on its swap entry before this helper returns, or 1751 - * the swap map may underflow. Currently, we only accept @subpage == NULL 1752 - * for shmem due to the limitation of swap continuation: shmem always 1753 - * duplicates the swap entry only once, so there is no such issue for it. 1592 + * the swap count may underflow. 1754 1593 */ 1755 1594 int folio_dup_swap(struct folio *folio, struct page *subpage) 1756 1595 { 1757 - int err = 0; 1758 1596 swp_entry_t entry = folio->swap; 1759 1597 unsigned long nr_pages = folio_nr_pages(folio); 1760 1598 ··· 1763 1607 nr_pages = 1; 1764 1608 } 1765 1609 1766 - while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) 1767 - err = add_swap_count_continuation(entry, GFP_ATOMIC); 1768 - 1769 - return err; 1610 + return swap_dup_entries_cluster(swap_entry_to_info(entry), 1611 + swp_offset(entry), nr_pages); 1770 1612 } 1771 1613 1772 1614 /** ··· 1791 1637 } 1792 1638 1793 1639 swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); 1794 - } 1795 - 1796 - static void swap_put_entry_locked(struct swap_info_struct *si, 1797 - struct swap_cluster_info *ci, 1798 - unsigned long offset) 1799 - { 1800 - unsigned char count; 1801 - 1802 - count = si->swap_map[offset]; 1803 - if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { 1804 - if (count == COUNT_CONTINUED) { 1805 - if (swap_count_continued(si, offset, count)) 1806 - count = SWAP_MAP_MAX | COUNT_CONTINUED; 1807 - else 1808 - count = SWAP_MAP_MAX; 1809 - } else 1810 - count--; 1811 - } 1812 - 1813 - WRITE_ONCE(si->swap_map[offset], count); 1814 - if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) 1815 - swap_entries_free(si, ci, offset, 1); 1816 1640 } 1817 1641 1818 1642 /* ··· 1859 1727 } 1860 1728 1861 1729 /* 1862 - * Drop the last ref of swap entries, caller have to ensure all entries 1863 - * belong to the same cgroup and cluster. 1730 + * Free a set of swap slots after their swap count dropped to zero, or will be 1731 + * zero after putting the last ref (saves one __swap_cluster_put_entry call). 1864 1732 */ 1865 - void swap_entries_free(struct swap_info_struct *si, 1866 - struct swap_cluster_info *ci, 1867 - unsigned long offset, unsigned int nr_pages) 1733 + void __swap_cluster_free_entries(struct swap_info_struct *si, 1734 + struct swap_cluster_info *ci, 1735 + unsigned int ci_start, unsigned int nr_pages) 1868 1736 { 1869 - swp_entry_t entry = swp_entry(si->type, offset); 1870 - unsigned char *map = si->swap_map + offset; 1871 - unsigned char *map_end = map + nr_pages; 1737 + unsigned long old_tb; 1738 + unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; 1739 + unsigned long offset = cluster_offset(si, ci) + ci_start; 1872 1740 1873 - /* It should never free entries across different clusters */ 1874 - VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); 1875 - VM_BUG_ON(cluster_is_empty(ci)); 1876 - VM_BUG_ON(ci->count < nr_pages); 1741 + VM_WARN_ON(ci->count < nr_pages); 1877 1742 1878 1743 ci->count -= nr_pages; 1879 1744 do { 1880 - VM_WARN_ON(*map > 1); 1881 - *map = 0; 1882 - } while (++map < map_end); 1745 + old_tb = __swap_table_get(ci, ci_off); 1746 + /* Release the last ref, or after swap cache is dropped */ 1747 + VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); 1748 + __swap_table_set(ci, ci_off, null_to_swp_tb()); 1749 + } while (++ci_off < ci_end); 1883 1750 1884 - mem_cgroup_uncharge_swap(entry, nr_pages); 1751 + mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); 1885 1752 swap_range_free(si, offset, nr_pages); 1886 - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); 1753 + swap_cluster_assert_empty(ci, ci_start, nr_pages, false); 1887 1754 1888 1755 if (!ci->count) 1889 1756 free_cluster(si, ci); ··· 1892 1761 1893 1762 int __swap_count(swp_entry_t entry) 1894 1763 { 1895 - struct swap_info_struct *si = __swap_entry_to_info(entry); 1896 - pgoff_t offset = swp_offset(entry); 1764 + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); 1765 + unsigned int ci_off = swp_cluster_offset(entry); 1897 1766 1898 - return si->swap_map[offset]; 1767 + return swp_tb_get_count(__swap_table_get(ci, ci_off)); 1899 1768 } 1900 1769 1901 1770 /** ··· 1907 1776 { 1908 1777 pgoff_t offset = swp_offset(entry); 1909 1778 struct swap_cluster_info *ci; 1910 - int count; 1779 + unsigned long swp_tb; 1911 1780 1912 1781 ci = swap_cluster_lock(si, offset); 1913 - count = si->swap_map[offset]; 1782 + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); 1914 1783 swap_cluster_unlock(ci); 1915 1784 1916 - return count && count != SWAP_MAP_BAD; 1785 + return swp_tb_get_count(swp_tb) > 0; 1917 1786 } 1918 1787 1919 1788 /* 1920 1789 * How many references to @entry are currently swapped out? 1921 - * This considers COUNT_CONTINUED so it returns exact answer. 1790 + * This returns exact answer. 1922 1791 */ 1923 1792 int swp_swapcount(swp_entry_t entry) 1924 1793 { 1925 - int count, tmp_count, n; 1926 1794 struct swap_info_struct *si; 1927 1795 struct swap_cluster_info *ci; 1928 - struct page *page; 1929 - pgoff_t offset; 1930 - unsigned char *map; 1796 + unsigned long swp_tb; 1797 + int count; 1931 1798 1932 1799 si = get_swap_device(entry); 1933 1800 if (!si) 1934 1801 return 0; 1935 1802 1936 - offset = swp_offset(entry); 1937 - 1938 - ci = swap_cluster_lock(si, offset); 1939 - 1940 - count = si->swap_map[offset]; 1941 - if (!(count & COUNT_CONTINUED)) 1942 - goto out; 1943 - 1944 - count &= ~COUNT_CONTINUED; 1945 - n = SWAP_MAP_MAX + 1; 1946 - 1947 - page = vmalloc_to_page(si->swap_map + offset); 1948 - offset &= ~PAGE_MASK; 1949 - VM_BUG_ON(page_private(page) != SWP_CONTINUED); 1950 - 1951 - do { 1952 - page = list_next_entry(page, lru); 1953 - map = kmap_local_page(page); 1954 - tmp_count = map[offset]; 1955 - kunmap_local(map); 1956 - 1957 - count += (tmp_count & ~COUNT_CONTINUED) * n; 1958 - n *= (SWAP_CONT_MAX + 1); 1959 - } while (tmp_count & COUNT_CONTINUED); 1960 - out: 1803 + ci = swap_cluster_lock(si, swp_offset(entry)); 1804 + swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); 1805 + count = swp_tb_get_count(swp_tb); 1806 + if (count == SWP_TB_COUNT_MAX) 1807 + count = ci->extend_table[swp_cluster_offset(entry)]; 1961 1808 swap_cluster_unlock(ci); 1962 1809 put_swap_device(si); 1963 - return count; 1810 + 1811 + return count < 0 ? 0 : count; 1964 1812 } 1965 1813 1966 1814 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, 1967 1815 swp_entry_t entry, int order) 1968 1816 { 1969 1817 struct swap_cluster_info *ci; 1970 - unsigned char *map = si->swap_map; 1971 1818 unsigned int nr_pages = 1 << order; 1972 1819 unsigned long roffset = swp_offset(entry); 1973 1820 unsigned long offset = round_down(roffset, nr_pages); 1821 + unsigned int ci_off; 1974 1822 int i; 1975 1823 bool ret = false; 1976 1824 1977 1825 ci = swap_cluster_lock(si, offset); 1978 1826 if (nr_pages == 1) { 1979 - if (map[roffset]) 1827 + ci_off = roffset % SWAPFILE_CLUSTER; 1828 + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) 1980 1829 ret = true; 1981 1830 goto unlock_out; 1982 1831 } 1983 1832 for (i = 0; i < nr_pages; i++) { 1984 - if (map[offset + i]) { 1833 + ci_off = (offset + i) % SWAPFILE_CLUSTER; 1834 + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) { 1985 1835 ret = true; 1986 1836 break; 1987 1837 } ··· 2128 2016 return; 2129 2017 2130 2018 ci = swap_cluster_lock(si, offset); 2131 - swap_put_entry_locked(si, ci, offset); 2019 + __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); 2020 + __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); 2132 2021 swap_cluster_unlock(ci); 2133 2022 2134 2023 /* In theory readahead might add it to the swap cache by accident */ ··· 2355 2242 unsigned int type) 2356 2243 { 2357 2244 pte_t *pte = NULL; 2358 - struct swap_info_struct *si; 2359 2245 2360 - si = swap_info[type]; 2361 2246 do { 2362 2247 struct folio *folio; 2363 - unsigned long offset; 2364 - unsigned char swp_count; 2248 + unsigned long swp_tb; 2365 2249 softleaf_t entry; 2366 2250 int ret; 2367 2251 pte_t ptent; ··· 2377 2267 if (swp_type(entry) != type) 2378 2268 continue; 2379 2269 2380 - offset = swp_offset(entry); 2381 2270 pte_unmap(pte); 2382 2271 pte = NULL; 2383 2272 ··· 2393 2284 &vmf); 2394 2285 } 2395 2286 if (!folio) { 2396 - swp_count = READ_ONCE(si->swap_map[offset]); 2397 - if (swp_count == 0 || swp_count == SWAP_MAP_BAD) 2287 + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), 2288 + swp_cluster_offset(entry)); 2289 + if (swp_tb_get_count(swp_tb) <= 0) 2398 2290 continue; 2399 2291 return -ENOMEM; 2400 2292 } ··· 2523 2413 } 2524 2414 2525 2415 /* 2526 - * Scan swap_map from current position to next entry still in use. 2416 + * Scan swap table from current position to next entry still in use. 2527 2417 * Return 0 if there are no inuse entries after prev till end of 2528 2418 * the map. 2529 2419 */ ··· 2532 2422 { 2533 2423 unsigned int i; 2534 2424 unsigned long swp_tb; 2535 - unsigned char count; 2536 2425 2537 2426 /* 2538 2427 * No need for swap_lock here: we're just looking ··· 2540 2431 * allocations from this area (while holding swap_lock). 2541 2432 */ 2542 2433 for (i = prev + 1; i < si->max; i++) { 2543 - count = READ_ONCE(si->swap_map[i]); 2544 2434 swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), 2545 2435 i % SWAPFILE_CLUSTER); 2546 - if (count == SWAP_MAP_BAD) 2547 - continue; 2548 - if (count || swp_tb_is_folio(swp_tb)) 2436 + if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) 2549 2437 break; 2550 2438 if ((i % LATENCY_LIMIT) == 0) 2551 2439 cond_resched(); ··· 2902 2796 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 2903 2797 { 2904 2798 struct swap_info_struct *p = NULL; 2905 - unsigned char *swap_map; 2906 2799 unsigned long *zeromap; 2907 2800 struct swap_cluster_info *cluster_info; 2908 2801 struct file *swap_file, *victim; ··· 2979 2874 flush_percpu_swap_cluster(p); 2980 2875 2981 2876 destroy_swap_extents(p, p->swap_file); 2982 - if (p->flags & SWP_CONTINUED) 2983 - free_swap_count_continuations(p); 2984 2877 2985 2878 if (!(p->flags & SWP_SOLIDSTATE)) 2986 2879 atomic_dec(&nr_rotate_swap); ··· 2990 2887 2991 2888 swap_file = p->swap_file; 2992 2889 p->swap_file = NULL; 2993 - swap_map = p->swap_map; 2994 - p->swap_map = NULL; 2995 2890 zeromap = p->zeromap; 2996 2891 p->zeromap = NULL; 2997 2892 maxpages = p->max; ··· 3003 2902 mutex_unlock(&swapon_mutex); 3004 2903 kfree(p->global_cluster); 3005 2904 p->global_cluster = NULL; 3006 - vfree(swap_map); 3007 2905 kvfree(zeromap); 3008 2906 free_swap_cluster_info(cluster_info, maxpages); 3009 2907 /* Destroy swap account information */ ··· 3222 3122 kvfree(defer); 3223 3123 } 3224 3124 spin_lock_init(&p->lock); 3225 - spin_lock_init(&p->cont_lock); 3226 3125 atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); 3227 3126 init_completion(&p->comp); 3228 3127 ··· 3346 3247 return 0; 3347 3248 3348 3249 return maxpages; 3349 - } 3350 - 3351 - static int setup_swap_map(struct swap_info_struct *si, 3352 - union swap_header *swap_header, 3353 - unsigned long maxpages) 3354 - { 3355 - unsigned char *swap_map; 3356 - 3357 - swap_map = vzalloc(maxpages); 3358 - si->swap_map = swap_map; 3359 - if (!swap_map) 3360 - return -ENOMEM; 3361 - return 0; 3362 3250 } 3363 3251 3364 3252 static int setup_swap_clusters_info(struct swap_info_struct *si, ··· 3532 3446 3533 3447 maxpages = si->max; 3534 3448 3535 - /* Setup the swap map and apply bad block */ 3536 - error = setup_swap_map(si, swap_header, maxpages); 3537 - if (error) 3538 - goto bad_swap_unlock_inode; 3539 - 3540 3449 /* Set up the swap cluster info */ 3541 3450 error = setup_swap_clusters_info(si, swap_header, maxpages); 3542 3451 if (error) ··· 3652 3571 inode = NULL; 3653 3572 destroy_swap_extents(si, swap_file); 3654 3573 swap_cgroup_swapoff(si->type); 3655 - vfree(si->swap_map); 3656 - si->swap_map = NULL; 3657 3574 free_swap_cluster_info(si->cluster_info, si->max); 3658 3575 si->cluster_info = NULL; 3659 3576 kvfree(si->zeromap); ··· 3693 3614 } 3694 3615 3695 3616 /* 3696 - * Verify that nr swap entries are valid and increment their swap map counts. 3617 + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. 3618 + * @entry: first swap entry from which we want to increase the refcount. 3697 3619 * 3698 - * Returns error code in following case. 3699 - * - success -> 0 3700 - * - swp_entry is invalid -> EINVAL 3701 - * - swap-mapped reference is requested but the entry is not used. -> ENOENT 3702 - * - swap-mapped reference requested but needs continued swap count. -> ENOMEM 3620 + * Returns 0 for success, or -ENOMEM if the extend table is required 3621 + * but could not be atomically allocated. Returns -EINVAL if the swap 3622 + * entry is invalid, which might occur if a page table entry has got 3623 + * corrupted. 3624 + * 3625 + * Context: Caller must ensure there is no race condition on the reference 3626 + * owner. e.g., locking the PTL of a PTE containing the entry being increased. 3703 3627 */ 3704 - static int swap_dup_entries(struct swap_info_struct *si, 3705 - struct swap_cluster_info *ci, 3706 - unsigned long offset, 3707 - unsigned char usage, int nr) 3628 + int swap_dup_entry_direct(swp_entry_t entry) 3708 3629 { 3709 - int i; 3710 - unsigned char count; 3711 - 3712 - for (i = 0; i < nr; i++) { 3713 - count = si->swap_map[offset + i]; 3714 - /* 3715 - * For swapin out, allocator never allocates bad slots. for 3716 - * swapin, readahead is guarded by swap_entry_swapped. 3717 - */ 3718 - if (WARN_ON(count == SWAP_MAP_BAD)) 3719 - return -ENOENT; 3720 - /* 3721 - * Swap count duplication must be guarded by either swap cache folio (from 3722 - * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct). 3723 - */ 3724 - if (WARN_ON(!count && 3725 - !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) 3726 - return -ENOENT; 3727 - if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) 3728 - return -EINVAL; 3729 - } 3730 - 3731 - for (i = 0; i < nr; i++) { 3732 - count = si->swap_map[offset + i]; 3733 - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) 3734 - count += usage; 3735 - else if (swap_count_continued(si, offset + i, count)) 3736 - count = COUNT_CONTINUED; 3737 - else { 3738 - /* 3739 - * Don't need to rollback changes, because if 3740 - * usage == 1, there must be nr == 1. 3741 - */ 3742 - return -ENOMEM; 3743 - } 3744 - 3745 - WRITE_ONCE(si->swap_map[offset + i], count); 3746 - } 3747 - 3748 - return 0; 3749 - } 3750 - 3751 - static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) 3752 - { 3753 - int err; 3754 3630 struct swap_info_struct *si; 3755 - struct swap_cluster_info *ci; 3756 - unsigned long offset = swp_offset(entry); 3757 3631 3758 3632 si = swap_entry_to_info(entry); 3759 3633 if (WARN_ON_ONCE(!si)) { ··· 3714 3682 return -EINVAL; 3715 3683 } 3716 3684 3717 - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); 3718 - ci = swap_cluster_lock(si, offset); 3719 - err = swap_dup_entries(si, ci, offset, usage, nr); 3720 - swap_cluster_unlock(ci); 3721 - return err; 3722 - } 3723 - 3724 - /* 3725 - * swap_dup_entry_direct() - Increase reference count of a swap entry by one. 3726 - * @entry: first swap entry from which we want to increase the refcount. 3727 - * 3728 - * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required 3729 - * but could not be atomically allocated. Returns 0, just as if it succeeded, 3730 - * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which 3731 - * might occur if a page table entry has got corrupted. 3732 - * 3733 - * Context: Caller must ensure there is no race condition on the reference 3734 - * owner. e.g., locking the PTL of a PTE containing the entry being increased. 3735 - */ 3736 - int swap_dup_entry_direct(swp_entry_t entry) 3737 - { 3738 - int err = 0; 3739 - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) 3740 - err = add_swap_count_continuation(entry, GFP_ATOMIC); 3741 - return err; 3742 - } 3743 - 3744 - /* 3745 - * add_swap_count_continuation - called when a swap count is duplicated 3746 - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 3747 - * page of the original vmalloc'ed swap_map, to hold the continuation count 3748 - * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called 3749 - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. 3750 - * 3751 - * These continuation pages are seldom referenced: the common paths all work 3752 - * on the original swap_map, only referring to a continuation page when the 3753 - * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 3754 - * 3755 - * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding 3756 - * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) 3757 - * can be called after dropping locks. 3758 - */ 3759 - int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) 3760 - { 3761 - struct swap_info_struct *si; 3762 - struct swap_cluster_info *ci; 3763 - struct page *head; 3764 - struct page *page; 3765 - struct page *list_page; 3766 - pgoff_t offset; 3767 - unsigned char count; 3768 - int ret = 0; 3769 - 3770 - /* 3771 - * When debugging, it's easier to use __GFP_ZERO here; but it's better 3772 - * for latency not to zero a page while GFP_ATOMIC and holding locks. 3773 - */ 3774 - page = alloc_page(gfp_mask | __GFP_HIGHMEM); 3775 - 3776 - si = get_swap_device(entry); 3777 - if (!si) { 3778 - /* 3779 - * An acceptable race has occurred since the failing 3780 - * __swap_duplicate(): the swap device may be swapoff 3781 - */ 3782 - goto outer; 3783 - } 3784 - 3785 - offset = swp_offset(entry); 3786 - 3787 - ci = swap_cluster_lock(si, offset); 3788 - 3789 - count = si->swap_map[offset]; 3790 - 3791 - if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { 3792 - /* 3793 - * The higher the swap count, the more likely it is that tasks 3794 - * will race to add swap count continuation: we need to avoid 3795 - * over-provisioning. 3796 - */ 3797 - goto out; 3798 - } 3799 - 3800 - if (!page) { 3801 - ret = -ENOMEM; 3802 - goto out; 3803 - } 3804 - 3805 - head = vmalloc_to_page(si->swap_map + offset); 3806 - offset &= ~PAGE_MASK; 3807 - 3808 - spin_lock(&si->cont_lock); 3809 - /* 3810 - * Page allocation does not initialize the page's lru field, 3811 - * but it does always reset its private field. 3812 - */ 3813 - if (!page_private(head)) { 3814 - BUG_ON(count & COUNT_CONTINUED); 3815 - INIT_LIST_HEAD(&head->lru); 3816 - set_page_private(head, SWP_CONTINUED); 3817 - si->flags |= SWP_CONTINUED; 3818 - } 3819 - 3820 - list_for_each_entry(list_page, &head->lru, lru) { 3821 - unsigned char *map; 3822 - 3823 - /* 3824 - * If the previous map said no continuation, but we've found 3825 - * a continuation page, free our allocation and use this one. 3826 - */ 3827 - if (!(count & COUNT_CONTINUED)) 3828 - goto out_unlock_cont; 3829 - 3830 - map = kmap_local_page(list_page) + offset; 3831 - count = *map; 3832 - kunmap_local(map); 3833 - 3834 - /* 3835 - * If this continuation count now has some space in it, 3836 - * free our allocation and use this one. 3837 - */ 3838 - if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) 3839 - goto out_unlock_cont; 3840 - } 3841 - 3842 - list_add_tail(&page->lru, &head->lru); 3843 - page = NULL; /* now it's attached, don't free it */ 3844 - out_unlock_cont: 3845 - spin_unlock(&si->cont_lock); 3846 - out: 3847 - swap_cluster_unlock(ci); 3848 - put_swap_device(si); 3849 - outer: 3850 - if (page) 3851 - __free_page(page); 3852 - return ret; 3853 - } 3854 - 3855 - /* 3856 - * swap_count_continued - when the original swap_map count is incremented 3857 - * from SWAP_MAP_MAX, check if there is already a continuation page to carry 3858 - * into, carry if so, or else fail until a new continuation page is allocated; 3859 - * when the original swap_map count is decremented from 0 with continuation, 3860 - * borrow from the continuation and report whether it still holds more. 3861 - * Called while __swap_duplicate() or caller of swap_put_entry_locked() 3862 - * holds cluster lock. 3863 - */ 3864 - static bool swap_count_continued(struct swap_info_struct *si, 3865 - pgoff_t offset, unsigned char count) 3866 - { 3867 - struct page *head; 3868 - struct page *page; 3869 - unsigned char *map; 3870 - bool ret; 3871 - 3872 - head = vmalloc_to_page(si->swap_map + offset); 3873 - if (page_private(head) != SWP_CONTINUED) { 3874 - BUG_ON(count & COUNT_CONTINUED); 3875 - return false; /* need to add count continuation */ 3876 - } 3877 - 3878 - spin_lock(&si->cont_lock); 3879 - offset &= ~PAGE_MASK; 3880 - page = list_next_entry(head, lru); 3881 - map = kmap_local_page(page) + offset; 3882 - 3883 - if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ 3884 - goto init_map; /* jump over SWAP_CONT_MAX checks */ 3885 - 3886 - if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ 3887 - /* 3888 - * Think of how you add 1 to 999 3889 - */ 3890 - while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { 3891 - kunmap_local(map); 3892 - page = list_next_entry(page, lru); 3893 - BUG_ON(page == head); 3894 - map = kmap_local_page(page) + offset; 3895 - } 3896 - if (*map == SWAP_CONT_MAX) { 3897 - kunmap_local(map); 3898 - page = list_next_entry(page, lru); 3899 - if (page == head) { 3900 - ret = false; /* add count continuation */ 3901 - goto out; 3902 - } 3903 - map = kmap_local_page(page) + offset; 3904 - init_map: *map = 0; /* we didn't zero the page */ 3905 - } 3906 - *map += 1; 3907 - kunmap_local(map); 3908 - while ((page = list_prev_entry(page, lru)) != head) { 3909 - map = kmap_local_page(page) + offset; 3910 - *map = COUNT_CONTINUED; 3911 - kunmap_local(map); 3912 - } 3913 - ret = true; /* incremented */ 3914 - 3915 - } else { /* decrementing */ 3916 - /* 3917 - * Think of how you subtract 1 from 1000 3918 - */ 3919 - BUG_ON(count != COUNT_CONTINUED); 3920 - while (*map == COUNT_CONTINUED) { 3921 - kunmap_local(map); 3922 - page = list_next_entry(page, lru); 3923 - BUG_ON(page == head); 3924 - map = kmap_local_page(page) + offset; 3925 - } 3926 - BUG_ON(*map == 0); 3927 - *map -= 1; 3928 - if (*map == 0) 3929 - count = 0; 3930 - kunmap_local(map); 3931 - while ((page = list_prev_entry(page, lru)) != head) { 3932 - map = kmap_local_page(page) + offset; 3933 - *map = SWAP_CONT_MAX | count; 3934 - count = COUNT_CONTINUED; 3935 - kunmap_local(map); 3936 - } 3937 - ret = count == COUNT_CONTINUED; 3938 - } 3939 - out: 3940 - spin_unlock(&si->cont_lock); 3941 - return ret; 3942 - } 3943 - 3944 - /* 3945 - * free_swap_count_continuations - swapoff free all the continuation pages 3946 - * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. 3947 - */ 3948 - static void free_swap_count_continuations(struct swap_info_struct *si) 3949 - { 3950 - pgoff_t offset; 3951 - 3952 - for (offset = 0; offset < si->max; offset += PAGE_SIZE) { 3953 - struct page *head; 3954 - head = vmalloc_to_page(si->swap_map + offset); 3955 - if (page_private(head)) { 3956 - struct page *page, *next; 3957 - 3958 - list_for_each_entry_safe(page, next, &head->lru, lru) { 3959 - list_del(&page->lru); 3960 - __free_page(page); 3961 - } 3962 - } 3963 - } 3685 + return swap_dup_entries_cluster(si, swp_offset(entry), 1); 3964 3686 } 3965 3687 3966 3688 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)

Configure Feed

Configure Feed