Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm/workingset: leave highest bits empty for anon shadow

Swap table entries will need 4 bits reserved for the swap count in the
shadow, so the anon shadow should keep its leading 4 bits as 0.

This should be OK for the foreseeable future. Take 52 bits of physical
address space as an example: for 4K pages, there would be at most 40 bits
for addressable pages. Currently, we have 36 bits available (64 - 1 - 16
- 10 - 1, where XA_VALUE takes 1 bit for the marker, MEM_CGROUP_ID_SHIFT takes
16 bits, NODES_SHIFT takes <=10 bits, and the WORKINGSET flag takes 1 bit).

So in the worst case, we previously needed to pack the 40 bits of address
into a 36-bit field using a 64K bucket (bucket_order = 4). After this
change, the anon field shrinks to 32 bits, so the bucket will be increased
to 1M (bucket_order = 8). This should be fine, as on such large machines
the working set size will be way larger than the bucket size.

And for MGLRU's gen number tracking, it should be even more than enough,
since MGLRU's gen number (max_seq) increments much more slowly than the
eviction counter (nonresident_age).

And after all, either the refault distance or the gen distance is only a
hint that can tolerate inaccuracy just fine.

And the 4 bits can be shrunk to 3, or extended to a higher value if needed
later.

Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-5-f4e34be021a7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Kairui Song and committed by
Andrew Morton
f3d652b0 0c7e6014

+34 -19
+4
mm/swap_table.h
··· 12 12 }; 13 13 14 14 #define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) 15 + #define SWP_TB_COUNT_BITS 4 15 16 16 17 /* 17 18 * A swap table entry represents the status of a swap slot on a swap ··· 22 21 * Each swap table entry could be a pointer (folio), a XA_VALUE 23 22 * (shadow), or NULL. 24 23 */ 24 + 25 + /* Macro for shadow offset calculation */ 26 + #define SWAP_COUNT_SHIFT SWP_TB_COUNT_BITS 25 27 26 28 /* 27 29 * Helpers for casting one type of info into a swap table entry.
+30 -19
mm/workingset.c
··· 16 16 #include <linux/dax.h> 17 17 #include <linux/fs.h> 18 18 #include <linux/mm.h> 19 + #include "swap_table.h" 19 20 #include "internal.h" 20 21 21 22 /* ··· 185 184 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ 186 185 WORKINGSET_SHIFT + NODES_SHIFT + \ 187 186 MEM_CGROUP_ID_SHIFT) 187 + #define EVICTION_SHIFT_ANON (EVICTION_SHIFT + SWAP_COUNT_SHIFT) 188 188 #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) 189 + #define EVICTION_MASK_ANON (~0UL >> EVICTION_SHIFT_ANON) 189 190 190 191 /* 191 192 * Eviction timestamps need to be able to cover the full range of ··· 197 194 * that case, we have to sacrifice granularity for distance, and group 198 195 * evictions into coarser buckets by shaving off lower timestamp bits. 199 196 */ 200 - static unsigned int bucket_order __read_mostly; 197 + static unsigned int bucket_order[ANON_AND_FILE] __read_mostly; 201 198 202 199 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, 203 - bool workingset) 200 + bool workingset, bool file) 204 201 { 205 - eviction &= EVICTION_MASK; 202 + eviction &= file ? 
EVICTION_MASK : EVICTION_MASK_ANON; 206 203 eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; 207 204 eviction = (eviction << NODES_SHIFT) | pgdat->node_id; 208 205 eviction = (eviction << WORKINGSET_SHIFT) | workingset; ··· 247 244 struct mem_cgroup *memcg = folio_memcg(folio); 248 245 struct pglist_data *pgdat = folio_pgdat(folio); 249 246 250 - BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); 247 + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > 248 + BITS_PER_LONG - max(EVICTION_SHIFT, EVICTION_SHIFT_ANON)); 251 249 252 250 lruvec = mem_cgroup_lruvec(memcg, pgdat); 253 251 lrugen = &lruvec->lrugen; ··· 258 254 hist = lru_hist_from_seq(min_seq); 259 255 atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); 260 256 261 - return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset); 257 + return pack_shadow(mem_cgroup_private_id(memcg), pgdat, token, workingset, type); 262 258 } 263 259 264 260 /* ··· 266 262 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. 267 263 */ 268 264 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, 269 - unsigned long *token, bool *workingset) 265 + unsigned long *token, bool *workingset, bool file) 270 266 { 271 267 int memcg_id; 272 268 unsigned long max_seq; ··· 279 275 *lruvec = mem_cgroup_lruvec(memcg, pgdat); 280 276 281 277 max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); 282 - max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; 278 + max_seq &= (file ? 
EVICTION_MASK : EVICTION_MASK_ANON) >> LRU_REFS_WIDTH; 283 279 284 280 return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; 285 281 } ··· 297 293 298 294 rcu_read_lock(); 299 295 300 - recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); 296 + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset, type); 301 297 if (lruvec != folio_lruvec(folio)) 302 298 goto unlock; 303 299 ··· 335 331 } 336 332 337 333 static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, 338 - unsigned long *token, bool *workingset) 334 + unsigned long *token, bool *workingset, bool file) 339 335 { 340 336 return false; 341 337 } ··· 385 381 void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) 386 382 { 387 383 struct pglist_data *pgdat = folio_pgdat(folio); 384 + int file = folio_is_file_lru(folio); 388 385 unsigned long eviction; 389 386 struct lruvec *lruvec; 390 387 int memcgid; ··· 402 397 /* XXX: target_memcg can be NULL, go through lruvec */ 403 398 memcgid = mem_cgroup_private_id(lruvec_memcg(lruvec)); 404 399 eviction = atomic_long_read(&lruvec->nonresident_age); 405 - eviction >>= bucket_order; 400 + eviction >>= bucket_order[file]; 406 401 workingset_age_nonresident(lruvec, folio_nr_pages(folio)); 407 402 return pack_shadow(memcgid, pgdat, eviction, 408 - folio_test_workingset(folio)); 403 + folio_test_workingset(folio), file); 409 404 } 410 405 411 406 /** ··· 436 431 bool recent; 437 432 438 433 rcu_read_lock(); 439 - recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); 434 + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, 435 + workingset, file); 440 436 rcu_read_unlock(); 441 437 return recent; 442 438 } 443 439 444 440 rcu_read_lock(); 445 441 unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); 446 - eviction <<= bucket_order; 442 + eviction <<= bucket_order[file]; 447 443 448 444 /* 449 445 * Look up the memcg associated with the stored ID. 
It might ··· 501 495 * longest time, so the occasional inappropriate activation 502 496 * leading to pressure on the active list is not a problem. 503 497 */ 504 - refault_distance = (refault - eviction) & EVICTION_MASK; 498 + refault_distance = ((refault - eviction) & 499 + (file ? EVICTION_MASK : EVICTION_MASK_ANON)); 505 500 506 501 /* 507 502 * Compare the distance to the existing workingset size. We ··· 787 780 788 781 static int __init workingset_init(void) 789 782 { 783 + unsigned int timestamp_bits, timestamp_bits_anon; 790 784 struct shrinker *workingset_shadow_shrinker; 791 - unsigned int timestamp_bits; 792 785 unsigned int max_order; 793 786 int ret = -ENOMEM; 794 787 ··· 801 794 * double the initial memory by using totalram_pages as-is. 802 795 */ 803 796 timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; 797 + timestamp_bits_anon = BITS_PER_LONG - EVICTION_SHIFT_ANON; 804 798 max_order = fls_long(totalram_pages() - 1); 805 - if (max_order > timestamp_bits) 806 - bucket_order = max_order - timestamp_bits; 807 - pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", 808 - timestamp_bits, max_order, bucket_order); 799 + if (max_order > (BITS_PER_LONG - EVICTION_SHIFT)) 800 + bucket_order[WORKINGSET_FILE] = max_order - timestamp_bits; 801 + if (max_order > timestamp_bits_anon) 802 + bucket_order[WORKINGSET_ANON] = max_order - timestamp_bits_anon; 803 + pr_info("workingset: timestamp_bits=%d (anon: %d) max_order=%d bucket_order=%u (anon: %d)\n", 804 + timestamp_bits, timestamp_bits_anon, max_order, 805 + bucket_order[WORKINGSET_FILE], bucket_order[WORKINGSET_ANON]); 809 806 810 807 workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | 811 808 SHRINKER_MEMCG_AWARE,