Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm, swap: implement helpers for reserving data in the swap table

To prepare for using the swap table as the unified swap layer, introduce
macros and helpers for storing multiple kinds of data in a swap table
entry.

From now on, the swap table stores the folio's PFN instead of the folio
pointer for cached slots, to make space for extra counting bits
(SWAP_COUNT). Shadows are still stored as they are, as SWAP_COUNT is not
used yet.

Also, rename shadow_swp_to_tb to shadow_to_swp_tb. The old name was a
spelling error, not really worth a separate fix.

No behaviour change yet; this only prepares the API.

Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-6-f4e34be021a7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
62629ae4 f3d652b0

+125 -14
+3 -3
mm/swap_state.c
··· 148 148 VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); 149 149 VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); 150 150 151 - new_tb = folio_to_swp_tb(folio); 151 + new_tb = folio_to_swp_tb(folio, 0); 152 152 ci_start = swp_cluster_offset(entry); 153 153 ci_off = ci_start; 154 154 ci_end = ci_start + nr_pages; ··· 249 249 VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); 250 250 251 251 si = __swap_entry_to_info(entry); 252 - new_tb = shadow_swp_to_tb(shadow); 252 + new_tb = shadow_to_swp_tb(shadow, 0); 253 253 ci_start = swp_cluster_offset(entry); 254 254 ci_end = ci_start + nr_pages; 255 255 ci_off = ci_start; ··· 331 331 VM_WARN_ON_ONCE(!entry.val); 332 332 333 333 /* Swap cache still stores N entries instead of a high-order entry */ 334 - new_tb = folio_to_swp_tb(new); 334 + new_tb = folio_to_swp_tb(new, 0); 335 335 do { 336 336 old_tb = __swap_table_xchg(ci, ci_off, new_tb); 337 337 WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
+122 -11
mm/swap_table.h
··· 12 12 }; 13 13 14 14 #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) 15 - #define SWP_TB_COUNT_BITS 4 16 15 17 16 /* 18 17 * A swap table entry represents the status of a swap slot on a swap 19 18 * (physical or virtual) device. The swap table in each cluster is a 20 19 * 1:1 map of the swap slots in this cluster. 21 20 * 22 - * Each swap table entry could be a pointer (folio), a XA_VALUE 23 - * (shadow), or NULL. 21 + * Swap table entry type and bits layouts: 22 + * 23 + * NULL: |---------------- 0 ---------------| - Free slot 24 + * Shadow: | SWAP_COUNT |---- SHADOW_VAL ---|1| - Swapped out slot 25 + * PFN: | SWAP_COUNT |------ PFN -------|10| - Cached slot 26 + * Pointer: |----------- Pointer ----------|100| - (Unused) 27 + * Bad: |------------- 1 -------------|1000| - Bad slot 28 + * 29 + * SWAP_COUNT is `SWP_TB_COUNT_BITS` long, each entry is an atomic long. 30 + * 31 + * Usages: 32 + * 33 + * - NULL: Swap slot is unused, could be allocated. 34 + * 35 + * - Shadow: Swap slot is used and not cached (usually swapped out). It reuses 36 + * the XA_VALUE format to be compatible with working set shadows. SHADOW_VAL 37 + * part might be all 0 if the working shadow info is absent. In such a case, 38 + * we still want to keep the shadow format as a placeholder. 39 + * 40 + * Memcg ID is embedded in SHADOW_VAL. 41 + * 42 + * - PFN: Swap slot is in use, and cached. Memcg info is recorded on the page 43 + * struct. 44 + * 45 + * - Pointer: Unused yet. `0b100` is reserved for potential pointer usage 46 + * because only the lower three bits can be used as a marker for 8 bytes 47 + * aligned pointers. 48 + * 49 + * - Bad: Swap slot is reserved, protects swap header or holes on swap devices. 
24 50 */ 51 + 52 + #if defined(MAX_POSSIBLE_PHYSMEM_BITS) 53 + #define SWAP_CACHE_PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) 54 + #elif defined(MAX_PHYSMEM_BITS) 55 + #define SWAP_CACHE_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 56 + #else 57 + #define SWAP_CACHE_PFN_BITS (BITS_PER_LONG - PAGE_SHIFT) 58 + #endif 59 + 60 + /* NULL Entry, all 0 */ 61 + #define SWP_TB_NULL 0UL 62 + 63 + /* Swapped out: shadow */ 64 + #define SWP_TB_SHADOW_MARK 0b1UL 65 + 66 + /* Cached: PFN */ 67 + #define SWP_TB_PFN_BITS (SWAP_CACHE_PFN_BITS + SWP_TB_PFN_MARK_BITS) 68 + #define SWP_TB_PFN_MARK 0b10UL 69 + #define SWP_TB_PFN_MARK_BITS 2 70 + #define SWP_TB_PFN_MARK_MASK (BIT(SWP_TB_PFN_MARK_BITS) - 1) 71 + 72 + /* SWAP_COUNT part for PFN or shadow, the width can be shrunk or extended */ 73 + #define SWP_TB_COUNT_BITS min(4, BITS_PER_LONG - SWP_TB_PFN_BITS) 74 + #define SWP_TB_COUNT_MASK (~((~0UL) >> SWP_TB_COUNT_BITS)) 75 + #define SWP_TB_COUNT_SHIFT (BITS_PER_LONG - SWP_TB_COUNT_BITS) 76 + #define SWP_TB_COUNT_MAX ((1 << SWP_TB_COUNT_BITS) - 1) 77 + 78 + /* Bad slot: ends with 0b1000 and rests of bits are all 1 */ 79 + #define SWP_TB_BAD ((~0UL) << 3) 25 80 26 81 /* Macro for shadow offset calculation */ 27 82 #define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS ··· 90 35 return 0; 91 36 } 92 37 93 - static inline unsigned long folio_to_swp_tb(struct folio *folio) 38 + static inline unsigned long __count_to_swp_tb(unsigned char count) 94 39 { 95 - BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); 96 - return (unsigned long)folio; 40 + /* 41 + * At least three values are needed to distinguish free (0), 42 + * used (count > 0 && count < SWP_TB_COUNT_MAX), and 43 + * overflow (count == SWP_TB_COUNT_MAX). 
44 + */ 45 + BUILD_BUG_ON(SWP_TB_COUNT_MAX < 2 || SWP_TB_COUNT_BITS < 2); 46 + VM_WARN_ON(count > SWP_TB_COUNT_MAX); 47 + return ((unsigned long)count) << SWP_TB_COUNT_SHIFT; 97 48 } 98 49 99 - static inline unsigned long shadow_swp_to_tb(void *shadow) 50 + static inline unsigned long pfn_to_swp_tb(unsigned long pfn, unsigned int count) 51 + { 52 + unsigned long swp_tb; 53 + 54 + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); 55 + BUILD_BUG_ON(SWAP_CACHE_PFN_BITS > 56 + (BITS_PER_LONG - SWP_TB_PFN_MARK_BITS - SWP_TB_COUNT_BITS)); 57 + 58 + swp_tb = (pfn << SWP_TB_PFN_MARK_BITS) | SWP_TB_PFN_MARK; 59 + VM_WARN_ON_ONCE(swp_tb & SWP_TB_COUNT_MASK); 60 + 61 + return swp_tb | __count_to_swp_tb(count); 62 + } 63 + 64 + static inline unsigned long folio_to_swp_tb(struct folio *folio, unsigned int count) 65 + { 66 + return pfn_to_swp_tb(folio_pfn(folio), count); 67 + } 68 + 69 + static inline unsigned long shadow_to_swp_tb(void *shadow, unsigned int count) 100 70 { 101 71 BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != 102 72 BITS_PER_BYTE * sizeof(unsigned long)); 73 + BUILD_BUG_ON((unsigned long)xa_mk_value(0) != SWP_TB_SHADOW_MARK); 74 + 103 75 VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); 104 - return (unsigned long)shadow; 76 + VM_WARN_ON_ONCE(shadow && ((unsigned long)shadow & SWP_TB_COUNT_MASK)); 77 + 78 + return (unsigned long)shadow | __count_to_swp_tb(count) | SWP_TB_SHADOW_MARK; 105 79 } 106 80 107 81 /* ··· 143 59 144 60 static inline bool swp_tb_is_folio(unsigned long swp_tb) 145 61 { 146 - return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); 62 + return ((swp_tb & SWP_TB_PFN_MARK_MASK) == SWP_TB_PFN_MARK); 147 63 } 148 64 149 65 static inline bool swp_tb_is_shadow(unsigned long swp_tb) 150 66 { 151 67 return xa_is_value((void *)swp_tb); 68 + } 69 + 70 + static inline bool swp_tb_is_bad(unsigned long swp_tb) 71 + { 72 + return swp_tb == SWP_TB_BAD; 73 + } 74 + 75 + static inline bool swp_tb_is_countable(unsigned long swp_tb) 76 + { 77 + return 
(swp_tb_is_shadow(swp_tb) || swp_tb_is_folio(swp_tb) || 78 + swp_tb_is_null(swp_tb)); 152 79 } 153 80 154 81 /* ··· 168 73 static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) 169 74 { 170 75 VM_WARN_ON(!swp_tb_is_folio(swp_tb)); 171 - return (void *)swp_tb; 76 + return pfn_folio((swp_tb & ~SWP_TB_COUNT_MASK) >> SWP_TB_PFN_MARK_BITS); 172 77 } 173 78 174 79 static inline void *swp_tb_to_shadow(unsigned long swp_tb) 175 80 { 176 81 VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); 177 - return (void *)swp_tb; 82 + /* No shift needed, xa_value is stored as it is in the lower bits. */ 83 + return (void *)(swp_tb & ~SWP_TB_COUNT_MASK); 84 + } 85 + 86 + static inline unsigned char __swp_tb_get_count(unsigned long swp_tb) 87 + { 88 + VM_WARN_ON(!swp_tb_is_countable(swp_tb)); 89 + return ((swp_tb & SWP_TB_COUNT_MASK) >> SWP_TB_COUNT_SHIFT); 90 + } 91 + 92 + static inline int swp_tb_get_count(unsigned long swp_tb) 93 + { 94 + if (swp_tb_is_countable(swp_tb)) 95 + return __swp_tb_get_count(swp_tb); 96 + return -EINVAL; 178 97 } 179 98 180 99 /* ··· 232 123 { 233 124 atomic_long_t *table; 234 125 unsigned long swp_tb; 126 + 127 + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); 235 128 236 129 rcu_read_lock(); 237 130 table = rcu_dereference(ci->table);