Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'mm-hotfixes-stable-2024-07-10-13-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
"21 hotfixes, 15 of which are cc:stable.

No identifiable theme here - all are singleton patches, 19 are for MM"

* tag 'mm-hotfixes-stable-2024-07-10-13-19' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (21 commits)
mm/hugetlb: fix kernel NULL pointer dereference when migrating hugetlb folio
mm/hugetlb: fix potential race in __update_and_free_hugetlb_folio()
filemap: replace pte_offset_map() with pte_offset_map_nolock()
arch/xtensa: always_inline get_current() and current_thread_info()
sched.h: always_inline alloc_tag_{save|restore} to fix modpost warnings
MAINTAINERS: mailmap: update Lorenzo Stoakes's email address
mm: fix crashes from deferred split racing folio migration
lib/build_OID_registry: avoid non-destructive substitution for Perl < 5.13.2 compat
mm: gup: stop abusing try_grab_folio
nilfs2: fix kernel bug on rename operation of broken directory
mm/hugetlb_vmemmap: fix race with speculative PFN walkers
cachestat: do not flush stats in recency check
mm/shmem: disable PMD-sized page cache if needed
mm/filemap: skip to create PMD-sized page cache if needed
mm/readahead: limit page cache size in page_cache_ra_order()
mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray
mm/damon/core: merge regions aggressively when max_nr_regions is unmet
Fix userfaultfd_api to return EINVAL as expected
mm: vmalloc: check if a hash-index is in cpu_possible_mask
mm: prevent dereferencing NULL ptr in pfn_section_valid()
...

+338 -285
+1
.mailmap
··· 384 384 Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org> 385 385 Lior David <quic_liord@quicinc.com> <liord@codeaurora.org> 386 386 Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com> 387 + Lorenzo Stoakes <lorenzo.stoakes@oracle.com> <lstoakes@gmail.com> 387 388 Luca Ceresoli <luca.ceresoli@bootlin.com> <luca@lucaceresoli.net> 388 389 Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com> 389 390 Luo Jie <quic_luoj@quicinc.com> <luoj@codeaurora.org>
+1 -1
MAINTAINERS
··· 14472 14472 M: Andrew Morton <akpm@linux-foundation.org> 14473 14473 R: Liam R. Howlett <Liam.Howlett@oracle.com> 14474 14474 R: Vlastimil Babka <vbabka@suse.cz> 14475 - R: Lorenzo Stoakes <lstoakes@gmail.com> 14475 + R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> 14476 14476 L: linux-mm@kvack.org 14477 14477 S: Maintained 14478 14478 W: http://www.linux-mm.org
+1 -1
arch/xtensa/include/asm/current.h
··· 19 19 20 20 struct task_struct; 21 21 22 - static inline struct task_struct *get_current(void) 22 + static __always_inline struct task_struct *get_current(void) 23 23 { 24 24 return current_thread_info()->task; 25 25 }
+1 -1
arch/xtensa/include/asm/thread_info.h
··· 91 91 } 92 92 93 93 /* how to get the thread information struct from C */ 94 - static inline struct thread_info *current_thread_info(void) 94 + static __always_inline struct thread_info *current_thread_info(void) 95 95 { 96 96 struct thread_info *ti; 97 97 __asm__("extui %0, a1, 0, "__stringify(CURRENT_SHIFT)"\n\t"
+30 -2
fs/nilfs2/dir.c
··· 383 383 384 384 struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop) 385 385 { 386 - struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop); 386 + struct folio *folio; 387 + struct nilfs_dir_entry *de, *next_de; 388 + size_t limit; 389 + char *msg; 387 390 391 + de = nilfs_get_folio(dir, 0, &folio); 388 392 if (IS_ERR(de)) 389 393 return NULL; 390 - return nilfs_next_entry(de); 394 + 395 + limit = nilfs_last_byte(dir, 0); /* is a multiple of chunk size */ 396 + if (unlikely(!limit || le64_to_cpu(de->inode) != dir->i_ino || 397 + !nilfs_match(1, ".", de))) { 398 + msg = "missing '.'"; 399 + goto fail; 400 + } 401 + 402 + next_de = nilfs_next_entry(de); 403 + /* 404 + * If "next_de" has not reached the end of the chunk, there is 405 + * at least one more record. Check whether it matches "..". 406 + */ 407 + if (unlikely((char *)next_de == (char *)de + nilfs_chunk_size(dir) || 408 + !nilfs_match(2, "..", next_de))) { 409 + msg = "missing '..'"; 410 + goto fail; 411 + } 412 + *foliop = folio; 413 + return next_de; 414 + 415 + fail: 416 + nilfs_error(dir->i_sb, "directory #%lu %s", dir->i_ino, msg); 417 + folio_release_kmap(folio, de); 418 + return NULL; 391 419 } 392 420 393 421 ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
+6 -1
fs/userfaultfd.c
··· 2057 2057 goto out; 2058 2058 features = uffdio_api.features; 2059 2059 ret = -EINVAL; 2060 - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) 2060 + if (uffdio_api.api != UFFD_API) 2061 2061 goto err_out; 2062 2062 ret = -EPERM; 2063 2063 if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) ··· 2081 2081 uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; 2082 2082 uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; 2083 2083 #endif 2084 + 2085 + ret = -EINVAL; 2086 + if (features & ~uffdio_api.features) 2087 + goto err_out; 2088 + 2084 2089 uffdio_api.ioctls = UFFD_API_IOCTLS; 2085 2090 ret = -EFAULT; 2086 2091 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+2 -1
include/linux/mmzone.h
··· 1979 1979 static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) 1980 1980 { 1981 1981 int idx = subsection_map_index(pfn); 1982 + struct mem_section_usage *usage = READ_ONCE(ms->usage); 1982 1983 1983 - return test_bit(idx, READ_ONCE(ms->usage)->subsection_map); 1984 + return usage ? test_bit(idx, usage->subsection_map) : 0; 1984 1985 } 1985 1986 #else 1986 1987 static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
+9 -48
include/linux/page_ref.h
··· 230 230 231 231 static inline bool page_ref_add_unless(struct page *page, int nr, int u) 232 232 { 233 - bool ret = atomic_add_unless(&page->_refcount, nr, u); 233 + bool ret = false; 234 + 235 + rcu_read_lock(); 236 + /* avoid writing to the vmemmap area being remapped */ 237 + if (!page_is_fake_head(page) && page_ref_count(page) != u) 238 + ret = atomic_add_unless(&page->_refcount, nr, u); 239 + rcu_read_unlock(); 234 240 235 241 if (page_ref_tracepoint_active(page_ref_mod_unless)) 236 242 __page_ref_mod_unless(page, nr, ret); ··· 264 258 return folio_ref_add_unless(folio, 1, 0); 265 259 } 266 260 267 - static inline bool folio_ref_try_add_rcu(struct folio *folio, int count) 261 + static inline bool folio_ref_try_add(struct folio *folio, int count) 268 262 { 269 - #ifdef CONFIG_TINY_RCU 270 - /* 271 - * The caller guarantees the folio will not be freed from interrupt 272 - * context, so (on !SMP) we only need preemption to be disabled 273 - * and TINY_RCU does that for us. 274 - */ 275 - # ifdef CONFIG_PREEMPT_COUNT 276 - VM_BUG_ON(!in_atomic() && !irqs_disabled()); 277 - # endif 278 - VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio); 279 - folio_ref_add(folio, count); 280 - #else 281 - if (unlikely(!folio_ref_add_unless(folio, count, 0))) { 282 - /* Either the folio has been freed, or will be freed. */ 283 - return false; 284 - } 285 - #endif 286 - return true; 287 - } 288 - 289 - /** 290 - * folio_try_get_rcu - Attempt to increase the refcount on a folio. 291 - * @folio: The folio. 292 - * 293 - * This is a version of folio_try_get() optimised for non-SMP kernels. 294 - * If you are still holding the rcu_read_lock() after looking up the 295 - * page and know that the page cannot have its refcount decreased to 296 - * zero in interrupt context, you can use this instead of folio_try_get(). 
297 - * 298 - * Example users include get_user_pages_fast() (as pages are not unmapped 299 - * from interrupt context) and the page cache lookups (as pages are not 300 - * truncated from interrupt context). We also know that pages are not 301 - * frozen in interrupt context for the purposes of splitting or migration. 302 - * 303 - * You can also use this function if you're holding a lock that prevents 304 - * pages being frozen & removed; eg the i_pages lock for the page cache 305 - * or the mmap_lock or page table lock for page tables. In this case, 306 - * it will always succeed, and you could have used a plain folio_get(), 307 - * but it's sometimes more convenient to have a common function called 308 - * from both locked and RCU-protected contexts. 309 - * 310 - * Return: True if the reference count was successfully incremented. 311 - */ 312 - static inline bool folio_try_get_rcu(struct folio *folio) 313 - { 314 - return folio_ref_try_add_rcu(folio, 1); 263 + return folio_ref_add_unless(folio, count, 0); 315 264 } 316 265 317 266 static inline int page_ref_freeze(struct page *page, int count)
+9 -2
include/linux/pagemap.h
··· 354 354 * a good order (that's 1MB if you're using 4kB pages) 355 355 */ 356 356 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 357 - #define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER 357 + #define PREFERRED_MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER 358 358 #else 359 - #define MAX_PAGECACHE_ORDER 8 359 + #define PREFERRED_MAX_PAGECACHE_ORDER 8 360 360 #endif 361 + 362 + /* 363 + * xas_split_alloc() does not support arbitrary orders. This implies no 364 + * 512MB THP on ARM64 with 64KB base page size. 365 + */ 366 + #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) 367 + #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) 361 368 362 369 /** 363 370 * mapping_set_large_folios() - Indicate the file supports large folios.
+2 -2
include/linux/sched.h
··· 2192 2192 extern void sched_set_stop_task(int cpu, struct task_struct *stop); 2193 2193 2194 2194 #ifdef CONFIG_MEM_ALLOC_PROFILING 2195 - static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) 2195 + static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) 2196 2196 { 2197 2197 swap(current->alloc_tag, tag); 2198 2198 return tag; 2199 2199 } 2200 2200 2201 - static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) 2201 + static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) 2202 2202 { 2203 2203 #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG 2204 2204 WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
+2 -1
include/linux/swap.h
··· 354 354 } 355 355 356 356 /* linux/mm/workingset.c */ 357 - bool workingset_test_recent(void *shadow, bool file, bool *workingset); 357 + bool workingset_test_recent(void *shadow, bool file, bool *workingset, 358 + bool flush); 358 359 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); 359 360 void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); 360 361 void workingset_refault(struct folio *folio, void *shadow);
+3 -1
lib/build_OID_registry
··· 38 38 # 39 39 open C_FILE, ">$ARGV[1]" or die; 40 40 print C_FILE "/*\n"; 41 - print C_FILE " * Automatically generated by ", $0 =~ s#^\Q$abs_srctree/\E##r, ". Do not edit\n"; 41 + my $scriptname = $0; 42 + $scriptname =~ s#^\Q$abs_srctree/\E##; 43 + print C_FILE " * Automatically generated by ", $scriptname, ". Do not edit\n"; 42 44 print C_FILE " */\n"; 43 45 44 46 #
+19 -2
mm/damon/core.c
··· 1358 1358 * access frequencies are similar. This is for minimizing the monitoring 1359 1359 * overhead under the dynamically changeable access pattern. If a merge was 1360 1360 * unnecessarily made, later 'kdamond_split_regions()' will revert it. 1361 + * 1362 + * The total number of regions could be higher than the user-defined limit, 1363 + * max_nr_regions in some cases. For example, the user can update 1364 + * max_nr_regions to a number that is lower than the current number of regions 1365 + * while DAMON is running. For such a case, repeat merging until the limit is 1366 + * met while increasing @threshold up to possible maximum level. 1361 1367 */ 1362 1368 static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, 1363 1369 unsigned long sz_limit) 1364 1370 { 1365 1371 struct damon_target *t; 1372 + unsigned int nr_regions; 1373 + unsigned int max_thres; 1366 1374 1367 - damon_for_each_target(t, c) 1368 - damon_merge_regions_of(t, threshold, sz_limit); 1375 + max_thres = c->attrs.aggr_interval / 1376 + (c->attrs.sample_interval ? c->attrs.sample_interval : 1); 1377 + do { 1378 + nr_regions = 0; 1379 + damon_for_each_target(t, c) { 1380 + damon_merge_regions_of(t, threshold, sz_limit); 1381 + nr_regions += damon_nr_regions(t); 1382 + } 1383 + threshold = max(1, threshold * 2); 1384 + } while (nr_regions > c->attrs.max_nr_regions && 1385 + threshold / 2 < max_thres); 1369 1386 } 1370 1387 1371 1388 /*
+12 -8
mm/filemap.c
··· 1847 1847 if (!folio || xa_is_value(folio)) 1848 1848 goto out; 1849 1849 1850 - if (!folio_try_get_rcu(folio)) 1850 + if (!folio_try_get(folio)) 1851 1851 goto repeat; 1852 1852 1853 1853 if (unlikely(folio != xas_reload(&xas))) { ··· 2001 2001 if (!folio || xa_is_value(folio)) 2002 2002 return folio; 2003 2003 2004 - if (!folio_try_get_rcu(folio)) 2004 + if (!folio_try_get(folio)) 2005 2005 goto reset; 2006 2006 2007 2007 if (unlikely(folio != xas_reload(xas))) { ··· 2181 2181 if (xa_is_value(folio)) 2182 2182 goto update_start; 2183 2183 2184 - if (!folio_try_get_rcu(folio)) 2184 + if (!folio_try_get(folio)) 2185 2185 goto retry; 2186 2186 2187 2187 if (unlikely(folio != xas_reload(&xas))) ··· 2313 2313 break; 2314 2314 if (xa_is_sibling(folio)) 2315 2315 break; 2316 - if (!folio_try_get_rcu(folio)) 2316 + if (!folio_try_get(folio)) 2317 2317 goto retry; 2318 2318 2319 2319 if (unlikely(folio != xas_reload(&xas))) ··· 3124 3124 3125 3125 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3126 3126 /* Use the readahead code, even if readahead is disabled */ 3127 - if (vm_flags & VM_HUGEPAGE) { 3127 + if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { 3128 3128 fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3129 3129 ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); 3130 3130 ra->size = HPAGE_PMD_NR; ··· 3231 3231 if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) 3232 3232 return 0; 3233 3233 3234 - ptep = pte_offset_map(vmf->pmd, vmf->address); 3234 + ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address, 3235 + &vmf->ptl); 3235 3236 if (unlikely(!ptep)) 3236 3237 return VM_FAULT_NOPAGE; 3237 3238 ··· 3473 3472 continue; 3474 3473 if (folio_test_locked(folio)) 3475 3474 continue; 3476 - if (!folio_try_get_rcu(folio)) 3475 + if (!folio_try_get(folio)) 3477 3476 continue; 3478 3477 /* Has the page moved or been split? 
*/ 3479 3478 if (unlikely(folio != xas_reload(xas))) ··· 4249 4248 XA_STATE(xas, &mapping->i_pages, first_index); 4250 4249 struct folio *folio; 4251 4250 4251 + /* Flush stats (and potentially sleep) outside the RCU read section. */ 4252 + mem_cgroup_flush_stats_ratelimited(NULL); 4253 + 4252 4254 rcu_read_lock(); 4253 4255 xas_for_each(&xas, folio, last_index) { 4254 4256 int order; ··· 4315 4311 goto resched; 4316 4312 } 4317 4313 #endif 4318 - if (workingset_test_recent(shadow, true, &workingset)) 4314 + if (workingset_test_recent(shadow, true, &workingset, false)) 4319 4315 cs->nr_recently_evicted += nr_pages; 4320 4316 4321 4317 goto resched;
+154 -137
mm/gup.c
··· 76 76 folio = page_folio(page); 77 77 if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) 78 78 return NULL; 79 - if (unlikely(!folio_ref_try_add_rcu(folio, refs))) 79 + if (unlikely(!folio_ref_try_add(folio, refs))) 80 80 return NULL; 81 81 82 82 /* ··· 93 93 folio_put_refs(folio, refs); 94 94 goto retry; 95 95 } 96 - 97 - return folio; 98 - } 99 - 100 - /** 101 - * try_grab_folio() - Attempt to get or pin a folio. 102 - * @page: pointer to page to be grabbed 103 - * @refs: the value to (effectively) add to the folio's refcount 104 - * @flags: gup flags: these are the FOLL_* flag values. 105 - * 106 - * "grab" names in this file mean, "look at flags to decide whether to use 107 - * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. 108 - * 109 - * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the 110 - * same time. (That's true throughout the get_user_pages*() and 111 - * pin_user_pages*() APIs.) Cases: 112 - * 113 - * FOLL_GET: folio's refcount will be incremented by @refs. 114 - * 115 - * FOLL_PIN on large folios: folio's refcount will be incremented by 116 - * @refs, and its pincount will be incremented by @refs. 117 - * 118 - * FOLL_PIN on single-page folios: folio's refcount will be incremented by 119 - * @refs * GUP_PIN_COUNTING_BIAS. 120 - * 121 - * Return: The folio containing @page (with refcount appropriately 122 - * incremented) for success, or NULL upon failure. If neither FOLL_GET 123 - * nor FOLL_PIN was set, that's considered failure, and furthermore, 124 - * a likely bug in the caller, so a warning is also emitted. 
125 - */ 126 - struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) 127 - { 128 - struct folio *folio; 129 - 130 - if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) 131 - return NULL; 132 - 133 - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) 134 - return NULL; 135 - 136 - if (flags & FOLL_GET) 137 - return try_get_folio(page, refs); 138 - 139 - /* FOLL_PIN is set */ 140 - 141 - /* 142 - * Don't take a pin on the zero page - it's not going anywhere 143 - * and it is used in a *lot* of places. 144 - */ 145 - if (is_zero_page(page)) 146 - return page_folio(page); 147 - 148 - folio = try_get_folio(page, refs); 149 - if (!folio) 150 - return NULL; 151 - 152 - /* 153 - * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a 154 - * right zone, so fail and let the caller fall back to the slow 155 - * path. 156 - */ 157 - if (unlikely((flags & FOLL_LONGTERM) && 158 - !folio_is_longterm_pinnable(folio))) { 159 - if (!put_devmap_managed_folio_refs(folio, refs)) 160 - folio_put_refs(folio, refs); 161 - return NULL; 162 - } 163 - 164 - /* 165 - * When pinning a large folio, use an exact count to track it. 166 - * 167 - * However, be sure to *also* increment the normal folio 168 - * refcount field at least once, so that the folio really 169 - * is pinned. That's why the refcount from the earlier 170 - * try_get_folio() is left intact. 171 - */ 172 - if (folio_test_large(folio)) 173 - atomic_add(refs, &folio->_pincount); 174 - else 175 - folio_ref_add(folio, 176 - refs * (GUP_PIN_COUNTING_BIAS - 1)); 177 - /* 178 - * Adjust the pincount before re-checking the PTE for changes. 179 - * This is essentially a smp_mb() and is paired with a memory 180 - * barrier in folio_try_share_anon_rmap_*(). 
181 - */ 182 - smp_mb__after_atomic(); 183 - 184 - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); 185 96 186 97 return folio; 187 98 } ··· 114 203 } 115 204 116 205 /** 117 - * try_grab_page() - elevate a page's refcount by a flag-dependent amount 118 - * @page: pointer to page to be grabbed 119 - * @flags: gup flags: these are the FOLL_* flag values. 206 + * try_grab_folio() - add a folio's refcount by a flag-dependent amount 207 + * @folio: pointer to folio to be grabbed 208 + * @refs: the value to (effectively) add to the folio's refcount 209 + * @flags: gup flags: these are the FOLL_* flag values 120 210 * 121 211 * This might not do anything at all, depending on the flags argument. 122 212 * 123 213 * "grab" names in this file mean, "look at flags to decide whether to use 124 - * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. 214 + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. 125 215 * 126 216 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same 127 - * time. Cases: please see the try_grab_folio() documentation, with 128 - * "refs=1". 217 + * time. 129 218 * 130 219 * Return: 0 for success, or if no action was required (if neither FOLL_PIN 131 220 * nor FOLL_GET was set, nothing is done). A negative error code for failure: 132 221 * 133 - * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not 222 + * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not 134 223 * be grabbed. 224 + * 225 + * It is called when we have a stable reference for the folio, typically in 226 + * GUP slow path. 
135 227 */ 136 - int __must_check try_grab_page(struct page *page, unsigned int flags) 228 + int __must_check try_grab_folio(struct folio *folio, int refs, 229 + unsigned int flags) 137 230 { 138 - struct folio *folio = page_folio(page); 139 - 140 231 if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) 141 232 return -ENOMEM; 142 233 143 - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) 234 + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) 144 235 return -EREMOTEIO; 145 236 146 237 if (flags & FOLL_GET) 147 - folio_ref_inc(folio); 238 + folio_ref_add(folio, refs); 148 239 else if (flags & FOLL_PIN) { 149 240 /* 150 241 * Don't take a pin on the zero page - it's not going anywhere 151 242 * and it is used in a *lot* of places. 152 243 */ 153 - if (is_zero_page(page)) 244 + if (is_zero_folio(folio)) 154 245 return 0; 155 246 156 247 /* 157 - * Similar to try_grab_folio(): be sure to *also* 158 - * increment the normal page refcount field at least once, 248 + * Increment the normal page refcount field at least once, 159 249 * so that the page really is pinned. 160 250 */ 161 251 if (folio_test_large(folio)) { 162 - folio_ref_add(folio, 1); 163 - atomic_add(1, &folio->_pincount); 252 + folio_ref_add(folio, refs); 253 + atomic_add(refs, &folio->_pincount); 164 254 } else { 165 - folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); 255 + folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS); 166 256 } 167 257 168 - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); 258 + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); 169 259 } 170 260 171 261 return 0; ··· 427 515 428 516 return nr; 429 517 } 518 + 519 + /** 520 + * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. 521 + * @page: pointer to page to be grabbed 522 + * @refs: the value to (effectively) add to the folio's refcount 523 + * @flags: gup flags: these are the FOLL_* flag values. 
524 + * 525 + * "grab" names in this file mean, "look at flags to decide whether to use 526 + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. 527 + * 528 + * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the 529 + * same time. (That's true throughout the get_user_pages*() and 530 + * pin_user_pages*() APIs.) Cases: 531 + * 532 + * FOLL_GET: folio's refcount will be incremented by @refs. 533 + * 534 + * FOLL_PIN on large folios: folio's refcount will be incremented by 535 + * @refs, and its pincount will be incremented by @refs. 536 + * 537 + * FOLL_PIN on single-page folios: folio's refcount will be incremented by 538 + * @refs * GUP_PIN_COUNTING_BIAS. 539 + * 540 + * Return: The folio containing @page (with refcount appropriately 541 + * incremented) for success, or NULL upon failure. If neither FOLL_GET 542 + * nor FOLL_PIN was set, that's considered failure, and furthermore, 543 + * a likely bug in the caller, so a warning is also emitted. 544 + * 545 + * It uses add ref unless zero to elevate the folio refcount and must be called 546 + * in fast path only. 547 + */ 548 + static struct folio *try_grab_folio_fast(struct page *page, int refs, 549 + unsigned int flags) 550 + { 551 + struct folio *folio; 552 + 553 + /* Raise warn if it is not called in fast GUP */ 554 + VM_WARN_ON_ONCE(!irqs_disabled()); 555 + 556 + if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) 557 + return NULL; 558 + 559 + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) 560 + return NULL; 561 + 562 + if (flags & FOLL_GET) 563 + return try_get_folio(page, refs); 564 + 565 + /* FOLL_PIN is set */ 566 + 567 + /* 568 + * Don't take a pin on the zero page - it's not going anywhere 569 + * and it is used in a *lot* of places. 
570 + */ 571 + if (is_zero_page(page)) 572 + return page_folio(page); 573 + 574 + folio = try_get_folio(page, refs); 575 + if (!folio) 576 + return NULL; 577 + 578 + /* 579 + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a 580 + * right zone, so fail and let the caller fall back to the slow 581 + * path. 582 + */ 583 + if (unlikely((flags & FOLL_LONGTERM) && 584 + !folio_is_longterm_pinnable(folio))) { 585 + if (!put_devmap_managed_folio_refs(folio, refs)) 586 + folio_put_refs(folio, refs); 587 + return NULL; 588 + } 589 + 590 + /* 591 + * When pinning a large folio, use an exact count to track it. 592 + * 593 + * However, be sure to *also* increment the normal folio 594 + * refcount field at least once, so that the folio really 595 + * is pinned. That's why the refcount from the earlier 596 + * try_get_folio() is left intact. 597 + */ 598 + if (folio_test_large(folio)) 599 + atomic_add(refs, &folio->_pincount); 600 + else 601 + folio_ref_add(folio, 602 + refs * (GUP_PIN_COUNTING_BIAS - 1)); 603 + /* 604 + * Adjust the pincount before re-checking the PTE for changes. 605 + * This is essentially a smp_mb() and is paired with a memory 606 + * barrier in folio_try_share_anon_rmap_*(). 
607 + */ 608 + smp_mb__after_atomic(); 609 + 610 + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); 611 + 612 + return folio; 613 + } 430 614 #endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */ 431 615 432 616 #ifdef CONFIG_ARCH_HAS_HUGEPD ··· 543 535 */ 544 536 static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz, 545 537 unsigned long addr, unsigned long end, unsigned int flags, 546 - struct page **pages, int *nr) 538 + struct page **pages, int *nr, bool fast) 547 539 { 548 540 unsigned long pte_end; 549 541 struct page *page; ··· 566 558 page = pte_page(pte); 567 559 refs = record_subpages(page, sz, addr, end, pages + *nr); 568 560 569 - folio = try_grab_folio(page, refs, flags); 570 - if (!folio) 571 - return 0; 561 + if (fast) { 562 + folio = try_grab_folio_fast(page, refs, flags); 563 + if (!folio) 564 + return 0; 565 + } else { 566 + folio = page_folio(page); 567 + if (try_grab_folio(folio, refs, flags)) 568 + return 0; 569 + } 572 570 573 571 if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { 574 572 gup_put_folio(folio, refs, flags); ··· 602 588 static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, 603 589 unsigned long addr, unsigned int pdshift, 604 590 unsigned long end, unsigned int flags, 605 - struct page **pages, int *nr) 591 + struct page **pages, int *nr, bool fast) 606 592 { 607 593 pte_t *ptep; 608 594 unsigned long sz = 1UL << hugepd_shift(hugepd); ··· 612 598 ptep = hugepte_offset(hugepd, addr, pdshift); 613 599 do { 614 600 next = hugepte_addr_end(addr, end, sz); 615 - ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr); 601 + ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, 602 + fast); 616 603 if (ret != 1) 617 604 return ret; 618 605 } while (ptep++, addr = next, addr != end); ··· 640 625 ptep = hugepte_offset(hugepd, addr, pdshift); 641 626 ptl = huge_pte_lock(h, vma->vm_mm, ptep); 642 627 ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE, 643 
- flags, &page, &nr); 628 + flags, &page, &nr, false); 644 629 spin_unlock(ptl); 645 630 646 631 if (ret == 1) { ··· 657 642 static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, 658 643 unsigned long addr, unsigned int pdshift, 659 644 unsigned long end, unsigned int flags, 660 - struct page **pages, int *nr) 645 + struct page **pages, int *nr, bool fast) 661 646 { 662 647 return 0; 663 648 } ··· 744 729 gup_must_unshare(vma, flags, page)) 745 730 return ERR_PTR(-EMLINK); 746 731 747 - ret = try_grab_page(page, flags); 732 + ret = try_grab_folio(page_folio(page), 1, flags); 748 733 if (ret) 749 734 page = ERR_PTR(ret); 750 735 else ··· 821 806 VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && 822 807 !PageAnonExclusive(page), page); 823 808 824 - ret = try_grab_page(page, flags); 809 + ret = try_grab_folio(page_folio(page), 1, flags); 825 810 if (ret) 826 811 return ERR_PTR(ret); 827 812 ··· 983 968 VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && 984 969 !PageAnonExclusive(page), page); 985 970 986 - /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ 987 - ret = try_grab_page(page, flags); 971 + /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ 972 + ret = try_grab_folio(page_folio(page), 1, flags); 988 973 if (unlikely(ret)) { 989 974 page = ERR_PTR(ret); 990 975 goto out; ··· 1248 1233 goto unmap; 1249 1234 *page = pte_page(entry); 1250 1235 } 1251 - ret = try_grab_page(*page, gup_flags); 1236 + ret = try_grab_folio(page_folio(*page), 1, gup_flags); 1252 1237 if (unlikely(ret)) 1253 1238 goto unmap; 1254 1239 out: ··· 1651 1636 * pages. 1652 1637 */ 1653 1638 if (page_increm > 1) { 1654 - struct folio *folio; 1639 + struct folio *folio = page_folio(page); 1655 1640 1656 1641 /* 1657 1642 * Since we already hold refcount on the 1658 1643 * large folio, this should never fail. 
1659 1644 */ 1660 - folio = try_grab_folio(page, page_increm - 1, 1661 - foll_flags); 1662 - if (WARN_ON_ONCE(!folio)) { 1645 + if (try_grab_folio(folio, page_increm - 1, 1646 + foll_flags)) { 1663 1647 /* 1664 1648 * Release the 1st page ref if the 1665 1649 * folio is problematic, fail hard. 1666 1650 */ 1667 - gup_put_folio(page_folio(page), 1, 1651 + gup_put_folio(folio, 1, 1668 1652 foll_flags); 1669 1653 ret = -EFAULT; 1670 1654 goto out; ··· 2811 2797 * This code is based heavily on the PowerPC implementation by Nick Piggin. 2812 2798 */ 2813 2799 #ifdef CONFIG_HAVE_GUP_FAST 2814 - 2815 2800 /* 2816 2801 * Used in the GUP-fast path to determine whether GUP is permitted to work on 2817 2802 * a specific folio. ··· 2975 2962 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 2976 2963 page = pte_page(pte); 2977 2964 2978 - folio = try_grab_folio(page, 1, flags); 2965 + folio = try_grab_folio_fast(page, 1, flags); 2979 2966 if (!folio) 2980 2967 goto pte_unmap; 2981 2968 ··· 3062 3049 break; 3063 3050 } 3064 3051 3065 - folio = try_grab_folio(page, 1, flags); 3052 + folio = try_grab_folio_fast(page, 1, flags); 3066 3053 if (!folio) { 3067 3054 gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); 3068 3055 break; ··· 3151 3138 page = pmd_page(orig); 3152 3139 refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); 3153 3140 3154 - folio = try_grab_folio(page, refs, flags); 3141 + folio = try_grab_folio_fast(page, refs, flags); 3155 3142 if (!folio) 3156 3143 return 0; 3157 3144 ··· 3195 3182 page = pud_page(orig); 3196 3183 refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); 3197 3184 3198 - folio = try_grab_folio(page, refs, flags); 3185 + folio = try_grab_folio_fast(page, refs, flags); 3199 3186 if (!folio) 3200 3187 return 0; 3201 3188 ··· 3235 3222 page = pgd_page(orig); 3236 3223 refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); 3237 3224 3238 - folio = try_grab_folio(page, refs, flags); 3225 + folio = try_grab_folio_fast(page, refs, 
flags); 3239 3226 if (!folio) 3240 3227 return 0; 3241 3228 ··· 3289 3276 * pmd format and THP pmd format 3290 3277 */ 3291 3278 if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr, 3292 - PMD_SHIFT, next, flags, pages, nr) != 1) 3279 + PMD_SHIFT, next, flags, pages, nr, 3280 + true) != 1) 3293 3281 return 0; 3294 3282 } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags, 3295 3283 pages, nr)) ··· 3320 3306 return 0; 3321 3307 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { 3322 3308 if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr, 3323 - PUD_SHIFT, next, flags, pages, nr) != 1) 3309 + PUD_SHIFT, next, flags, pages, nr, 3310 + true) != 1) 3324 3311 return 0; 3325 3312 } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags, 3326 3313 pages, nr)) ··· 3348 3333 BUILD_BUG_ON(p4d_leaf(p4d)); 3349 3334 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { 3350 3335 if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr, 3351 - P4D_SHIFT, next, flags, pages, nr) != 1) 3336 + P4D_SHIFT, next, flags, pages, nr, 3337 + true) != 1) 3352 3338 return 0; 3353 3339 } else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags, 3354 3340 pages, nr)) ··· 3378 3362 return; 3379 3363 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { 3380 3364 if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr, 3381 - PGDIR_SHIFT, next, flags, pages, nr) != 1) 3365 + PGDIR_SHIFT, next, flags, pages, nr, 3366 + true) != 1) 3382 3367 return; 3383 3368 } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, 3384 3369 pages, nr))
+1 -1
mm/huge_memory.c
··· 1331 1331 if (!*pgmap) 1332 1332 return ERR_PTR(-EFAULT); 1333 1333 page = pfn_to_page(pfn); 1334 - ret = try_grab_page(page, flags); 1334 + ret = try_grab_folio(page_folio(page), 1, flags); 1335 1335 if (ret) 1336 1336 page = ERR_PTR(ret); 1337 1337
+17 -53
mm/hugetlb.c
··· 1625 1625 * folio appears as just a compound page. Otherwise, wait until after 1626 1626 * allocating vmemmap to clear the flag. 1627 1627 * 1628 - * A reference is held on the folio, except in the case of demote. 1629 - * 1630 1628 * Must be called with hugetlb lock held. 1631 1629 */ 1632 - static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, 1633 - bool adjust_surplus, 1634 - bool demote) 1630 + static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, 1631 + bool adjust_surplus) 1635 1632 { 1636 1633 int nid = folio_nid(folio); 1637 1634 ··· 1642 1645 list_del(&folio->lru); 1643 1646 1644 1647 if (folio_test_hugetlb_freed(folio)) { 1648 + folio_clear_hugetlb_freed(folio); 1645 1649 h->free_huge_pages--; 1646 1650 h->free_huge_pages_node[nid]--; 1647 1651 } ··· 1659 1661 if (!folio_test_hugetlb_vmemmap_optimized(folio)) 1660 1662 __folio_clear_hugetlb(folio); 1661 1663 1662 - /* 1663 - * In the case of demote we do not ref count the page as it will soon 1664 - * be turned into a page of smaller size. 1665 - */ 1666 - if (!demote) 1667 - folio_ref_unfreeze(folio, 1); 1668 - 1669 1664 h->nr_huge_pages--; 1670 1665 h->nr_huge_pages_node[nid]--; 1671 - } 1672 - 1673 - static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, 1674 - bool adjust_surplus) 1675 - { 1676 - __remove_hugetlb_folio(h, folio, adjust_surplus, false); 1677 - } 1678 - 1679 - static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, 1680 - bool adjust_surplus) 1681 - { 1682 - __remove_hugetlb_folio(h, folio, adjust_surplus, true); 1683 1666 } 1684 1667 1685 1668 static void add_hugetlb_folio(struct hstate *h, struct folio *folio, 1686 1669 bool adjust_surplus) 1687 1670 { 1688 - int zeroed; 1689 1671 int nid = folio_nid(folio); 1690 1672 1691 1673 VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); ··· 1688 1710 * folio_change_private(folio, NULL) cleared it. 
1689 1711 */ 1690 1712 folio_set_hugetlb_vmemmap_optimized(folio); 1691 - 1692 - /* 1693 - * This folio is about to be managed by the hugetlb allocator and 1694 - * should have no users. Drop our reference, and check for others 1695 - * just in case. 1696 - */ 1697 - zeroed = folio_put_testzero(folio); 1698 - if (unlikely(!zeroed)) 1699 - /* 1700 - * It is VERY unlikely soneone else has taken a ref 1701 - * on the folio. In this case, we simply return as 1702 - * free_huge_folio() will be called when this other ref 1703 - * is dropped. 1704 - */ 1705 - return; 1706 1713 1707 1714 arch_clear_hugetlb_flags(folio); 1708 1715 enqueue_hugetlb_folio(h, folio); ··· 1726 1763 } 1727 1764 1728 1765 /* 1729 - * Move PageHWPoison flag from head page to the raw error pages, 1730 - * which makes any healthy subpages reusable. 1731 - */ 1732 - if (unlikely(folio_test_hwpoison(folio))) 1733 - folio_clear_hugetlb_hwpoison(folio); 1734 - 1735 - /* 1736 1766 * If vmemmap pages were allocated above, then we need to clear the 1737 1767 * hugetlb flag under the hugetlb lock. 1738 1768 */ ··· 1734 1778 __folio_clear_hugetlb(folio); 1735 1779 spin_unlock_irq(&hugetlb_lock); 1736 1780 } 1781 + 1782 + /* 1783 + * Move PageHWPoison flag from head page to the raw error pages, 1784 + * which makes any healthy subpages reusable. 1785 + */ 1786 + if (unlikely(folio_test_hwpoison(folio))) 1787 + folio_clear_hugetlb_hwpoison(folio); 1788 + 1789 + folio_ref_unfreeze(folio, 1); 1737 1790 1738 1791 /* 1739 1792 * Non-gigantic pages demoted from CMA allocated gigantic pages ··· 2162 2197 nid = numa_mem_id(); 2163 2198 retry: 2164 2199 folio = __folio_alloc(gfp_mask, order, nid, nmask); 2200 + /* Ensure hugetlb folio won't have large_rmappable flag set. 
*/ 2201 + if (folio) 2202 + folio_clear_large_rmappable(folio); 2165 2203 2166 2204 if (folio && !folio_ref_freeze(folio, 1)) { 2167 2205 folio_put(folio); ··· 3047 3079 3048 3080 free_new: 3049 3081 spin_unlock_irq(&hugetlb_lock); 3050 - if (new_folio) { 3051 - /* Folio has a zero ref count, but needs a ref to be freed */ 3052 - folio_ref_unfreeze(new_folio, 1); 3082 + if (new_folio) 3053 3083 update_and_free_hugetlb_folio(h, new_folio, false); 3054 - } 3055 3084 3056 3085 return ret; 3057 3086 } ··· 3903 3938 3904 3939 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); 3905 3940 3906 - remove_hugetlb_folio_for_demote(h, folio, false); 3941 + remove_hugetlb_folio(h, folio, false); 3907 3942 spin_unlock_irq(&hugetlb_lock); 3908 3943 3909 3944 /* ··· 3917 3952 if (rc) { 3918 3953 /* Allocation of vmemmmap failed, we can not demote folio */ 3919 3954 spin_lock_irq(&hugetlb_lock); 3920 - folio_ref_unfreeze(folio, 1); 3921 3955 add_hugetlb_folio(h, folio, false); 3922 3956 return rc; 3923 3957 }
+16
mm/hugetlb_vmemmap.c
··· 446 446 unsigned long vmemmap_reuse; 447 447 448 448 VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); 449 + VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); 450 + 449 451 if (!folio_test_hugetlb_vmemmap_optimized(folio)) 450 452 return 0; 451 453 ··· 483 481 */ 484 482 int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) 485 483 { 484 + /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ 485 + synchronize_rcu(); 486 + 486 487 return __hugetlb_vmemmap_restore_folio(h, folio, 0); 487 488 } 488 489 ··· 509 504 struct folio *folio, *t_folio; 510 505 long restored = 0; 511 506 long ret = 0; 507 + 508 + /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ 509 + synchronize_rcu(); 512 510 513 511 list_for_each_entry_safe(folio, t_folio, folio_list, lru) { 514 512 if (folio_test_hugetlb_vmemmap_optimized(folio)) { ··· 558 550 unsigned long vmemmap_reuse; 559 551 560 552 VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); 553 + VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); 554 + 561 555 if (!vmemmap_should_optimize_folio(h, folio)) 562 556 return ret; 563 557 ··· 611 601 { 612 602 LIST_HEAD(vmemmap_pages); 613 603 604 + /* avoid writes from page_ref_add_unless() while folding vmemmap */ 605 + synchronize_rcu(); 606 + 614 607 __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); 615 608 free_vmemmap_page_list(&vmemmap_pages); 616 609 } ··· 656 643 } 657 644 658 645 flush_tlb_all(); 646 + 647 + /* avoid writes from page_ref_add_unless() while folding vmemmap */ 648 + synchronize_rcu(); 659 649 660 650 list_for_each_entry(folio, folio_list, lru) { 661 651 int ret;
+2 -2
mm/internal.h
··· 1182 1182 /* 1183 1183 * mm/gup.c 1184 1184 */ 1185 - struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); 1186 - int __must_check try_grab_page(struct page *page, unsigned int flags); 1185 + int __must_check try_grab_folio(struct folio *folio, int refs, 1186 + unsigned int flags); 1187 1187 1188 1188 /* 1189 1189 * mm/huge_memory.c
-11
mm/memcontrol.c
··· 7823 7823 7824 7824 /* Transfer the charge and the css ref */ 7825 7825 commit_charge(new, memcg); 7826 - /* 7827 - * If the old folio is a large folio and is in the split queue, it needs 7828 - * to be removed from the split queue now, in case getting an incorrect 7829 - * split queue in destroy_large_folio() after the memcg of the old folio 7830 - * is cleared. 7831 - * 7832 - * In addition, the old folio is about to be freed after migration, so 7833 - * removing from the split queue a bit earlier seems reasonable. 7834 - */ 7835 - if (folio_test_large(old) && folio_test_large_rmappable(old)) 7836 - folio_undo_large_rmappable(old); 7837 7826 old->memcg_data = 0; 7838 7827 } 7839 7828
+13
mm/migrate.c
··· 415 415 if (folio_ref_count(folio) != expected_count) 416 416 return -EAGAIN; 417 417 418 + /* Take off deferred split queue while frozen and memcg set */ 419 + if (folio_test_large(folio) && 420 + folio_test_large_rmappable(folio)) { 421 + if (!folio_ref_freeze(folio, expected_count)) 422 + return -EAGAIN; 423 + folio_undo_large_rmappable(folio); 424 + folio_ref_unfreeze(folio, expected_count); 425 + } 426 + 418 427 /* No turning back from here */ 419 428 newfolio->index = folio->index; 420 429 newfolio->mapping = folio->mapping; ··· 441 432 xas_unlock_irq(&xas); 442 433 return -EAGAIN; 443 434 } 435 + 436 + /* Take off deferred split queue while frozen and memcg set */ 437 + if (folio_test_large(folio) && folio_test_large_rmappable(folio)) 438 + folio_undo_large_rmappable(folio); 444 439 445 440 /* 446 441 * Now we know that no one else is looking at the folio:
+4 -4
mm/readahead.c
··· 503 503 504 504 limit = min(limit, index + ra->size - 1); 505 505 506 - if (new_order < MAX_PAGECACHE_ORDER) { 506 + if (new_order < MAX_PAGECACHE_ORDER) 507 507 new_order += 2; 508 - new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); 509 - new_order = min_t(unsigned int, new_order, ilog2(ra->size)); 510 - } 508 + 509 + new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); 510 + new_order = min_t(unsigned int, new_order, ilog2(ra->size)); 511 511 512 512 /* See comment in page_cache_ra_unbounded() */ 513 513 nofs = memalloc_nofs_save();
+13 -2
mm/shmem.c
··· 541 541 542 542 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; 543 543 544 - bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, 545 - struct mm_struct *mm, unsigned long vm_flags) 544 + static bool __shmem_is_huge(struct inode *inode, pgoff_t index, 545 + bool shmem_huge_force, struct mm_struct *mm, 546 + unsigned long vm_flags) 546 547 { 547 548 loff_t i_size; 548 549 ··· 572 571 default: 573 572 return false; 574 573 } 574 + } 575 + 576 + bool shmem_is_huge(struct inode *inode, pgoff_t index, 577 + bool shmem_huge_force, struct mm_struct *mm, 578 + unsigned long vm_flags) 579 + { 580 + if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) 581 + return false; 582 + 583 + return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags); 575 584 } 576 585 577 586 #if defined(CONFIG_SYSFS)
+9 -1
mm/vmalloc.c
··· 2543 2543 static struct xarray * 2544 2544 addr_to_vb_xa(unsigned long addr) 2545 2545 { 2546 - int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus(); 2546 + int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids; 2547 + 2548 + /* 2549 + * Please note, nr_cpu_ids points on a highest set 2550 + * possible bit, i.e. we never invoke cpumask_next() 2551 + * if an index points on it which is nr_cpu_ids - 1. 2552 + */ 2553 + if (!cpu_possible(index)) 2554 + index = cpumask_next(index, cpu_possible_mask); 2547 2555 2548 2556 return &per_cpu(vmap_block_queue, index).vmap_blocks; 2549 2557 }
+11 -3
mm/workingset.c
··· 412 412 * @file: whether the corresponding folio is from the file lru. 413 413 * @workingset: where the workingset value unpacked from shadow should 414 414 * be stored. 415 + * @flush: whether to flush cgroup rstat. 415 416 * 416 417 * Return: true if the shadow is for a recently evicted folio; false otherwise. 417 418 */ 418 - bool workingset_test_recent(void *shadow, bool file, bool *workingset) 419 + bool workingset_test_recent(void *shadow, bool file, bool *workingset, 420 + bool flush) 419 421 { 420 422 struct mem_cgroup *eviction_memcg; 421 423 struct lruvec *eviction_lruvec; ··· 469 467 470 468 /* 471 469 * Flush stats (and potentially sleep) outside the RCU read section. 470 + * 471 + * Note that workingset_test_recent() itself might be called in RCU read 472 + * section (for e.g, in cachestat) - these callers need to skip flushing 473 + * stats (via the flush argument). 474 + * 472 475 * XXX: With per-memcg flushing and thresholding, is ratelimiting 473 476 * still needed here? 474 477 */ 475 - mem_cgroup_flush_stats_ratelimited(eviction_memcg); 478 + if (flush) 479 + mem_cgroup_flush_stats_ratelimited(eviction_memcg); 476 480 477 481 eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); 478 482 refault = atomic_long_read(&eviction_lruvec->nonresident_age); ··· 566 558 567 559 mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); 568 560 569 - if (!workingset_test_recent(shadow, file, &workingset)) 561 + if (!workingset_test_recent(shadow, file, &workingset, true)) 570 562 return; 571 563 572 564 folio_set_active(folio);