Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm: memcontrol: prepare for reparenting LRU pages for lruvec lock

The following diagram illustrates how to ensure the safety of the folio
lruvec lock when LRU folios undergo reparenting.

In the folio_lruvec_lock(folio) function:

    rcu_read_lock();
retry:
    lruvec = folio_lruvec(folio);
    /* There is a possibility of folio reparenting at this point. */
    spin_lock(&lruvec->lru_lock);
    if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) {
        /*
         * The wrong lruvec lock was acquired, and a retry is required.
         * This is because the folio resides on the parent memcg lruvec
         * list.
         */
        spin_unlock(&lruvec->lru_lock);
        goto retry;
    }

    /* Reaching here indicates that folio_memcg() is stable. */


In the memcg_reparent_objcgs(memcg) function:

spin_lock(&lruvec->lru_lock);
spin_lock(&lruvec_parent->lru_lock);
/* Transfer folios from the lruvec list to the parent's. */
spin_unlock(&lruvec_parent->lru_lock);
spin_unlock(&lruvec->lru_lock);

After acquiring the lruvec lock, it is necessary to verify whether the
folio has been reparented. If it has, the lock that was taken belongs to
the wrong lruvec: it must be dropped and the lock of the folio's current
lruvec acquired instead. The LRU folio reparenting process will also take
the lruvec lock (this will be implemented in a subsequent patch), so once
the check passes, folio_memcg() remains unchanged while the lruvec lock
is held.

Given that lruvec_memcg(lruvec) is always equal to folio_memcg(folio)
after the lruvec lock is acquired, the lruvec_memcg_debug() check is
redundant. Hence, it is removed.

This patch serves as a preparation for the reparenting of LRU folios.

Link: https://lore.kernel.org/23f22cbb1419f277a3483018b32158ae2b86c666.1772711148.git.zhengqi.arch@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yosry Ahmed <yosry@kernel.org>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Muchun Song and committed by
Andrew Morton
31b54a5e d14f8785

+73 -52
+17 -17
include/linux/memcontrol.h
··· 741 741 * folio_lruvec - return lruvec for isolating/putting an LRU folio 742 742 * @folio: Pointer to the folio. 743 743 * 744 - * This function relies on folio->mem_cgroup being stable. 744 + * Call with rcu_read_lock() held to ensure the lifetime of the returned lruvec. 745 + * Note that this alone will NOT guarantee the stability of the folio->lruvec 746 + * association; the folio can be reparented to an ancestor if this races with 747 + * cgroup deletion. 748 + * 749 + * Use folio_lruvec_lock() to ensure both lifetime and stability of the binding. 750 + * Once a lruvec is locked, folio_lruvec() can be called on other folios, and 751 + * their binding is stable if the returned lruvec matches the one the caller has 752 + * locked. Useful for lock batching. 745 753 */ 746 754 static inline struct lruvec *folio_lruvec(struct folio *folio) 747 755 { ··· 771 763 struct lruvec *folio_lruvec_lock_irq(struct folio *folio); 772 764 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, 773 765 unsigned long *flags); 774 - 775 - #ifdef CONFIG_DEBUG_VM 776 - void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); 777 - #else 778 - static inline 779 - void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) 780 - { 781 - } 782 - #endif 783 766 784 767 static inline 785 768 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ ··· 1197 1198 return &pgdat->__lruvec; 1198 1199 } 1199 1200 1200 - static inline 1201 - void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) 1202 - { 1203 - } 1204 - 1205 1201 static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 1206 1202 { 1207 1203 return NULL; ··· 1255 1261 { 1256 1262 struct pglist_data *pgdat = folio_pgdat(folio); 1257 1263 1264 + rcu_read_lock(); 1258 1265 spin_lock(&pgdat->__lruvec.lru_lock); 1259 1266 return &pgdat->__lruvec; 1260 1267 } ··· 1264 1269 { 1265 1270 struct pglist_data *pgdat = folio_pgdat(folio); 1266 1271 1272 + 
rcu_read_lock(); 1267 1273 spin_lock_irq(&pgdat->__lruvec.lru_lock); 1268 1274 return &pgdat->__lruvec; 1269 1275 } ··· 1274 1278 { 1275 1279 struct pglist_data *pgdat = folio_pgdat(folio); 1276 1280 1281 + rcu_read_lock(); 1277 1282 spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); 1278 1283 return &pgdat->__lruvec; 1279 1284 } ··· 1497 1500 1498 1501 static inline void lruvec_lock_irq(struct lruvec *lruvec) 1499 1502 { 1503 + rcu_read_lock(); 1500 1504 spin_lock_irq(&lruvec->lru_lock); 1501 1505 } 1502 1506 1503 1507 static inline void lruvec_unlock(struct lruvec *lruvec) 1504 1508 { 1505 1509 spin_unlock(&lruvec->lru_lock); 1510 + rcu_read_unlock(); 1506 1511 } 1507 1512 1508 1513 static inline void lruvec_unlock_irq(struct lruvec *lruvec) 1509 1514 { 1510 1515 spin_unlock_irq(&lruvec->lru_lock); 1516 + rcu_read_unlock(); 1511 1517 } 1512 1518 1513 - static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, 1514 - unsigned long flags) 1519 + static inline void lruvec_unlock_irqrestore(struct lruvec *lruvec, unsigned long flags) 1515 1520 { 1516 1521 spin_unlock_irqrestore(&lruvec->lru_lock, flags); 1522 + rcu_read_unlock(); 1517 1523 } 1518 1524 1519 1525 /* Test requires a stable folio->memcg binding, see folio_memcg() */
+1 -2
include/linux/swap.h
··· 310 310 311 311 /* linux/mm/swap.c */ 312 312 void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, 313 - unsigned int nr_io, unsigned int nr_rotated) 314 - __releases(lruvec->lru_lock); 313 + unsigned int nr_io, unsigned int nr_rotated); 315 314 void lru_note_cost_refault(struct folio *); 316 315 void folio_add_lru(struct folio *); 317 316 void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
+23 -6
mm/compaction.c
··· 518 518 return true; 519 519 } 520 520 521 + static struct lruvec * 522 + compact_folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags, 523 + struct compact_control *cc) 524 + { 525 + struct lruvec *lruvec; 526 + 527 + rcu_read_lock(); 528 + retry: 529 + lruvec = folio_lruvec(folio); 530 + compact_lock_irqsave(&lruvec->lru_lock, flags, cc); 531 + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { 532 + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); 533 + goto retry; 534 + } 535 + 536 + return lruvec; 537 + } 538 + 521 539 /* 522 540 * Compaction requires the taking of some coarse locks that are potentially 523 541 * very heavily contended. The lock should be periodically unlocked to avoid ··· 857 839 { 858 840 pg_data_t *pgdat = cc->zone->zone_pgdat; 859 841 unsigned long nr_scanned = 0, nr_isolated = 0; 860 - struct lruvec *lruvec; 842 + struct lruvec *lruvec = NULL; 861 843 unsigned long flags = 0; 862 844 struct lruvec *locked = NULL; 863 845 struct folio *folio = NULL; ··· 1171 1153 if (!folio_test_clear_lru(folio)) 1172 1154 goto isolate_fail_put; 1173 1155 1174 - lruvec = folio_lruvec(folio); 1156 + if (locked) 1157 + lruvec = folio_lruvec(folio); 1175 1158 1176 1159 /* If we already hold the lock, we can skip some rechecking */ 1177 - if (lruvec != locked) { 1160 + if (lruvec != locked || !locked) { 1178 1161 if (locked) 1179 1162 lruvec_unlock_irqrestore(locked, flags); 1180 1163 1181 - compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); 1164 + lruvec = compact_folio_lruvec_lock_irqsave(folio, &flags, cc); 1182 1165 locked = lruvec; 1183 - 1184 - lruvec_memcg_debug(lruvec, folio); 1185 1166 1186 1167 /* 1187 1168 * Try get exclusive access under lock. If marked for
+27 -26
mm/memcontrol.c
··· 1206 1206 } 1207 1207 } 1208 1208 1209 - #ifdef CONFIG_DEBUG_VM 1210 - void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) 1211 - { 1212 - struct mem_cgroup *memcg; 1213 - 1214 - if (mem_cgroup_disabled()) 1215 - return; 1216 - 1217 - memcg = folio_memcg(folio); 1218 - 1219 - if (!memcg) 1220 - VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); 1221 - else 1222 - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); 1223 - } 1224 - #endif 1225 - 1226 1209 /** 1227 1210 * folio_lruvec_lock - Lock the lruvec for a folio. 1228 1211 * @folio: Pointer to the folio. ··· 1215 1232 * - folio_test_lru false 1216 1233 * - folio frozen (refcount of 0) 1217 1234 * 1218 - * Return: The lruvec this folio is on with its lock held. 1235 + * Return: The lruvec this folio is on with its lock held and rcu read lock held. 1219 1236 */ 1220 1237 struct lruvec *folio_lruvec_lock(struct folio *folio) 1221 1238 { 1222 - struct lruvec *lruvec = folio_lruvec(folio); 1239 + struct lruvec *lruvec; 1223 1240 1241 + rcu_read_lock(); 1242 + retry: 1243 + lruvec = folio_lruvec(folio); 1224 1244 spin_lock(&lruvec->lru_lock); 1225 - lruvec_memcg_debug(lruvec, folio); 1245 + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { 1246 + spin_unlock(&lruvec->lru_lock); 1247 + goto retry; 1248 + } 1226 1249 1227 1250 return lruvec; 1228 1251 } ··· 1243 1254 * - folio frozen (refcount of 0) 1244 1255 * 1245 1256 * Return: The lruvec this folio is on with its lock held and interrupts 1246 - * disabled. 1257 + * disabled and rcu read lock held. 
1247 1258 */ 1248 1259 struct lruvec *folio_lruvec_lock_irq(struct folio *folio) 1249 1260 { 1250 - struct lruvec *lruvec = folio_lruvec(folio); 1261 + struct lruvec *lruvec; 1251 1262 1263 + rcu_read_lock(); 1264 + retry: 1265 + lruvec = folio_lruvec(folio); 1252 1266 spin_lock_irq(&lruvec->lru_lock); 1253 - lruvec_memcg_debug(lruvec, folio); 1267 + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { 1268 + spin_unlock_irq(&lruvec->lru_lock); 1269 + goto retry; 1270 + } 1254 1271 1255 1272 return lruvec; 1256 1273 } ··· 1272 1277 * - folio frozen (refcount of 0) 1273 1278 * 1274 1279 * Return: The lruvec this folio is on with its lock held and interrupts 1275 - * disabled. 1280 + * disabled and rcu read lock held. 1276 1281 */ 1277 1282 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, 1278 1283 unsigned long *flags) 1279 1284 { 1280 - struct lruvec *lruvec = folio_lruvec(folio); 1285 + struct lruvec *lruvec; 1281 1286 1287 + rcu_read_lock(); 1288 + retry: 1289 + lruvec = folio_lruvec(folio); 1282 1290 spin_lock_irqsave(&lruvec->lru_lock, *flags); 1283 - lruvec_memcg_debug(lruvec, folio); 1291 + if (unlikely(lruvec_memcg(lruvec) != folio_memcg(folio))) { 1292 + spin_unlock_irqrestore(&lruvec->lru_lock, *flags); 1293 + goto retry; 1294 + } 1284 1295 1285 1296 return lruvec; 1286 1297 }
+5 -1
mm/swap.c
··· 240 240 void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, 241 241 unsigned int nr_io, unsigned int nr_rotated) 242 242 __releases(lruvec->lru_lock) 243 + __releases(rcu) 243 244 { 244 245 unsigned long cost; 245 246 ··· 254 253 cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; 255 254 if (!cost) { 256 255 spin_unlock_irq(&lruvec->lru_lock); 256 + rcu_read_unlock(); 257 257 return; 258 258 } 259 259 ··· 287 285 288 286 spin_unlock_irq(&lruvec->lru_lock); 289 287 lruvec = parent_lruvec(lruvec); 290 - if (!lruvec) 288 + if (!lruvec) { 289 + rcu_read_unlock(); 291 290 break; 291 + } 292 292 spin_lock_irq(&lruvec->lru_lock); 293 293 } 294 294 }