Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge yet more updates from Andrew Morton:
"Subsystems affected by this patch series: mm (memcg, migration,
pagemap, gup, madvise, vmalloc), ia64, and misc"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (31 commits)
mm: remove duplicate include statement in mmu.c
mm: remove the filename in the top of file comment in vmalloc.c
mm: cleanup the gfp_mask handling in __vmalloc_area_node
mm: remove alloc_vm_area
x86/xen: open code alloc_vm_area in arch_gnttab_valloc
xen/xenbus: use apply_to_page_range directly in xenbus_map_ring_pv
drm/i915: use vmap in i915_gem_object_map
drm/i915: stop using kmap in i915_gem_object_map
drm/i915: use vmap in shmem_pin_map
zsmalloc: switch from alloc_vm_area to get_vm_area
mm: allow a NULL fn callback in apply_to_page_range
mm: add a vmap_pfn function
mm: add a VM_MAP_PUT_PAGES flag for vmap
mm: update the documentation for vfree
mm/madvise: introduce process_madvise() syscall: an external memory hinting API
pid: move pidfd_get_pid() to pid.c
mm/madvise: pass mm to do_madvise
selftests/vm: 10x speedup for hmm-tests
binfmt_elf: take the mmap lock around find_extend_vma()
mm/gup_benchmark: take the mmap lock around GUP
...

+601 -448
+1
arch/alpha/kernel/syscalls/syscall.tbl
··· 479 479 547 common openat2 sys_openat2 480 480 548 common pidfd_getfd sys_pidfd_getfd 481 481 549 common faccessat2 sys_faccessat2 482 + 550 common process_madvise sys_process_madvise
-1
arch/arm/mm/mmu.c
··· 17 17 18 18 #include <asm/cp15.h> 19 19 #include <asm/cputype.h> 20 - #include <asm/sections.h> 21 20 #include <asm/cachetype.h> 22 21 #include <asm/fixmap.h> 23 22 #include <asm/sections.h>
+1
arch/arm/tools/syscall.tbl
··· 453 453 437 common openat2 sys_openat2 454 454 438 common pidfd_getfd sys_pidfd_getfd 455 455 439 common faccessat2 sys_faccessat2 456 + 440 common process_madvise sys_process_madvise
+1 -1
arch/arm64/include/asm/unistd.h
··· 38 38 #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) 39 39 #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) 40 40 41 - #define __NR_compat_syscalls 440 41 + #define __NR_compat_syscalls 441 42 42 #endif 43 43 44 44 #define __ARCH_WANT_SYS_CLONE
+2
arch/arm64/include/asm/unistd32.h
··· 887 887 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) 888 888 #define __NR_faccessat2 439 889 889 __SYSCALL(__NR_faccessat2, sys_faccessat2) 890 + #define __NR_process_madvise 440 891 + __SYSCALL(__NR_process_madvise, sys_process_madvise) 890 892 891 893 /* 892 894 * Please add new compat syscalls above this comment and update
+1 -1
arch/ia64/kernel/Makefile
··· 40 40 endif 41 41 obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o 42 42 43 - obj-$(CONFIG_BINFMT_ELF) += elfcore.o 43 + obj-$(CONFIG_ELF_CORE) += elfcore.o 44 44 45 45 # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. 46 46 CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
+1
arch/ia64/kernel/syscalls/syscall.tbl
··· 360 360 437 common openat2 sys_openat2 361 361 438 common pidfd_getfd sys_pidfd_getfd 362 362 439 common faccessat2 sys_faccessat2 363 + 440 common process_madvise sys_process_madvise
+1
arch/m68k/kernel/syscalls/syscall.tbl
··· 439 439 437 common openat2 sys_openat2 440 440 438 common pidfd_getfd sys_pidfd_getfd 441 441 439 common faccessat2 sys_faccessat2 442 + 440 common process_madvise sys_process_madvise
+1
arch/microblaze/kernel/syscalls/syscall.tbl
··· 445 445 437 common openat2 sys_openat2 446 446 438 common pidfd_getfd sys_pidfd_getfd 447 447 439 common faccessat2 sys_faccessat2 448 + 440 common process_madvise sys_process_madvise
+1
arch/mips/kernel/syscalls/syscall_n32.tbl
··· 378 378 437 n32 openat2 sys_openat2 379 379 438 n32 pidfd_getfd sys_pidfd_getfd 380 380 439 n32 faccessat2 sys_faccessat2 381 + 440 n32 process_madvise sys_process_madvise
+1
arch/mips/kernel/syscalls/syscall_n64.tbl
··· 354 354 437 n64 openat2 sys_openat2 355 355 438 n64 pidfd_getfd sys_pidfd_getfd 356 356 439 n64 faccessat2 sys_faccessat2 357 + 440 n64 process_madvise sys_process_madvise
+1
arch/mips/kernel/syscalls/syscall_o32.tbl
··· 427 427 437 o32 openat2 sys_openat2 428 428 438 o32 pidfd_getfd sys_pidfd_getfd 429 429 439 o32 faccessat2 sys_faccessat2 430 + 440 o32 process_madvise sys_process_madvise
+1
arch/parisc/kernel/syscalls/syscall.tbl
··· 437 437 437 common openat2 sys_openat2 438 438 438 common pidfd_getfd sys_pidfd_getfd 439 439 439 common faccessat2 sys_faccessat2 440 + 440 common process_madvise sys_process_madvise
+1
arch/powerpc/kernel/syscalls/syscall.tbl
··· 529 529 437 common openat2 sys_openat2 530 530 438 common pidfd_getfd sys_pidfd_getfd 531 531 439 common faccessat2 sys_faccessat2 532 + 440 common process_madvise sys_process_madvise
+1
arch/s390/kernel/syscalls/syscall.tbl
··· 442 442 437 common openat2 sys_openat2 sys_openat2 443 443 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 444 444 439 common faccessat2 sys_faccessat2 sys_faccessat2 445 + 440 common process_madvise sys_process_madvise sys_process_madvise
+1
arch/sh/kernel/syscalls/syscall.tbl
··· 442 442 437 common openat2 sys_openat2 443 443 438 common pidfd_getfd sys_pidfd_getfd 444 444 439 common faccessat2 sys_faccessat2 445 + 440 common process_madvise sys_process_madvise
+1
arch/sparc/kernel/syscalls/syscall.tbl
··· 485 485 437 common openat2 sys_openat2 486 486 438 common pidfd_getfd sys_pidfd_getfd 487 487 439 common faccessat2 sys_faccessat2 488 + 440 common process_madvise sys_process_madvise
+1
arch/x86/entry/syscalls/syscall_32.tbl
··· 444 444 437 i386 openat2 sys_openat2 445 445 438 i386 pidfd_getfd sys_pidfd_getfd 446 446 439 i386 faccessat2 sys_faccessat2 447 + 440 i386 process_madvise sys_process_madvise
+1
arch/x86/entry/syscalls/syscall_64.tbl
··· 361 361 437 common openat2 sys_openat2 362 362 438 common pidfd_getfd sys_pidfd_getfd 363 363 439 common faccessat2 sys_faccessat2 364 + 440 common process_madvise sys_process_madvise 364 365 365 366 # 366 367 # x32-specific system call numbers start at 512 to avoid cache impact
+20 -7
arch/x86/xen/grant-table.c
··· 25 25 static struct gnttab_vm_area { 26 26 struct vm_struct *area; 27 27 pte_t **ptes; 28 + int idx; 28 29 } gnttab_shared_vm_area, gnttab_status_vm_area; 29 30 30 31 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, ··· 91 90 } 92 91 } 93 92 93 + static int gnttab_apply(pte_t *pte, unsigned long addr, void *data) 94 + { 95 + struct gnttab_vm_area *area = data; 96 + 97 + area->ptes[area->idx++] = pte; 98 + return 0; 99 + } 100 + 94 101 static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) 95 102 { 96 103 area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL); 97 104 if (area->ptes == NULL) 98 105 return -ENOMEM; 99 - 100 - area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes); 101 - if (area->area == NULL) { 102 - kfree(area->ptes); 103 - return -ENOMEM; 104 - } 105 - 106 + area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP); 107 + if (!area->area) 108 + goto out_free_ptes; 109 + if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr, 110 + PAGE_SIZE * nr_frames, gnttab_apply, area)) 111 + goto out_free_vm_area; 106 112 return 0; 113 + out_free_vm_area: 114 + free_vm_area(area->area); 115 + out_free_ptes: 116 + kfree(area->ptes); 117 + return -ENOMEM; 107 118 } 108 119 109 120 static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+1
arch/xtensa/kernel/syscalls/syscall.tbl
··· 410 410 437 common openat2 sys_openat2 411 411 438 common pidfd_getfd sys_pidfd_getfd 412 412 439 common faccessat2 sys_faccessat2 413 + 440 common process_madvise sys_process_madvise
+1
drivers/gpu/drm/i915/Kconfig
··· 25 25 select CRC32 26 26 select SND_HDA_I915 if SND_HDA_CORE 27 27 select CEC_CORE if CEC_NOTIFIER 28 + select VMAP_PFN 28 29 help 29 30 Choose this option if you have a system that has "Intel Graphics 30 31 Media Accelerator" or "HD Graphics" integrated graphics,
+62 -74
drivers/gpu/drm/i915/gem/i915_gem_pages.c
··· 162 162 { 163 163 if (is_vmalloc_addr(ptr)) 164 164 vunmap(ptr); 165 - else 166 - kunmap(kmap_to_page(ptr)); 167 165 } 168 166 169 167 struct sg_table * ··· 232 234 return err; 233 235 } 234 236 235 - static inline pte_t iomap_pte(resource_size_t base, 236 - dma_addr_t offset, 237 - pgprot_t prot) 238 - { 239 - return pte_mkspecial(pfn_pte((base + offset) >> PAGE_SHIFT, prot)); 240 - } 241 - 242 237 /* The 'mapping' part of i915_gem_object_pin_map() below */ 243 - static void *i915_gem_object_map(struct drm_i915_gem_object *obj, 244 - enum i915_map_type type) 238 + static void *i915_gem_object_map_page(struct drm_i915_gem_object *obj, 239 + enum i915_map_type type) 245 240 { 246 - unsigned long n_pte = obj->base.size >> PAGE_SHIFT; 247 - struct sg_table *sgt = obj->mm.pages; 248 - pte_t *stack[32], **mem; 249 - struct vm_struct *area; 241 + unsigned long n_pages = obj->base.size >> PAGE_SHIFT, i; 242 + struct page *stack[32], **pages = stack, *page; 243 + struct sgt_iter iter; 250 244 pgprot_t pgprot; 245 + void *vaddr; 251 246 252 - if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC) 253 - return NULL; 254 - 255 - if (GEM_WARN_ON(type == I915_MAP_WC && 256 - !static_cpu_has(X86_FEATURE_PAT))) 257 - return NULL; 258 - 259 - /* A single page can always be kmapped */ 260 - if (n_pte == 1 && type == I915_MAP_WB) { 261 - struct page *page = sg_page(sgt->sgl); 262 - 247 + switch (type) { 248 + default: 249 + MISSING_CASE(type); 250 + fallthrough; /* to use PAGE_KERNEL anyway */ 251 + case I915_MAP_WB: 263 252 /* 264 253 * On 32b, highmem using a finite set of indirect PTE (i.e. 265 254 * vmap) to provide virtual mappings of the high pages. ··· 262 277 * forever. 263 278 * 264 279 * So if the page is beyond the 32b boundary, make an explicit 265 - * vmap. On 64b, this check will be optimised away as we can 266 - * directly kmap any page on the system. 280 + * vmap. 
267 281 */ 268 - if (!PageHighMem(page)) 269 - return kmap(page); 270 - } 271 - 272 - mem = stack; 273 - if (n_pte > ARRAY_SIZE(stack)) { 274 - /* Too big for stack -- allocate temporary array instead */ 275 - mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL); 276 - if (!mem) 277 - return NULL; 278 - } 279 - 280 - area = alloc_vm_area(obj->base.size, mem); 281 - if (!area) { 282 - if (mem != stack) 283 - kvfree(mem); 284 - return NULL; 285 - } 286 - 287 - switch (type) { 288 - default: 289 - MISSING_CASE(type); 290 - fallthrough; /* to use PAGE_KERNEL anyway */ 291 - case I915_MAP_WB: 282 + if (n_pages == 1 && !PageHighMem(sg_page(obj->mm.pages->sgl))) 283 + return page_address(sg_page(obj->mm.pages->sgl)); 292 284 pgprot = PAGE_KERNEL; 293 285 break; 294 286 case I915_MAP_WC: ··· 273 311 break; 274 312 } 275 313 276 - if (i915_gem_object_has_struct_page(obj)) { 277 - struct sgt_iter iter; 278 - struct page *page; 279 - pte_t **ptes = mem; 280 - 281 - for_each_sgt_page(page, iter, sgt) 282 - **ptes++ = mk_pte(page, pgprot); 283 - } else { 284 - resource_size_t iomap; 285 - struct sgt_iter iter; 286 - pte_t **ptes = mem; 287 - dma_addr_t addr; 288 - 289 - iomap = obj->mm.region->iomap.base; 290 - iomap -= obj->mm.region->region.start; 291 - 292 - for_each_sgt_daddr(addr, iter, sgt) 293 - **ptes++ = iomap_pte(iomap, addr, pgprot); 314 + if (n_pages > ARRAY_SIZE(stack)) { 315 + /* Too big for stack -- allocate temporary array instead */ 316 + pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL); 317 + if (!pages) 318 + return NULL; 294 319 } 295 320 296 - if (mem != stack) 297 - kvfree(mem); 321 + i = 0; 322 + for_each_sgt_page(page, iter, obj->mm.pages) 323 + pages[i++] = page; 324 + vaddr = vmap(pages, n_pages, 0, pgprot); 325 + if (pages != stack) 326 + kvfree(pages); 327 + return vaddr; 328 + } 298 329 299 - return area->addr; 330 + static void *i915_gem_object_map_pfn(struct drm_i915_gem_object *obj, 331 + enum i915_map_type type) 332 + { 333 + 
resource_size_t iomap = obj->mm.region->iomap.base - 334 + obj->mm.region->region.start; 335 + unsigned long n_pfn = obj->base.size >> PAGE_SHIFT; 336 + unsigned long stack[32], *pfns = stack, i; 337 + struct sgt_iter iter; 338 + dma_addr_t addr; 339 + void *vaddr; 340 + 341 + if (type != I915_MAP_WC) 342 + return NULL; 343 + 344 + if (n_pfn > ARRAY_SIZE(stack)) { 345 + /* Too big for stack -- allocate temporary array instead */ 346 + pfns = kvmalloc_array(n_pfn, sizeof(*pfns), GFP_KERNEL); 347 + if (!pfns) 348 + return NULL; 349 + } 350 + 351 + i = 0; 352 + for_each_sgt_daddr(addr, iter, obj->mm.pages) 353 + pfns[i++] = (iomap + addr) >> PAGE_SHIFT; 354 + vaddr = vmap_pfn(pfns, n_pfn, pgprot_writecombine(PAGE_KERNEL_IO)); 355 + if (pfns != stack) 356 + kvfree(pfns); 357 + return vaddr; 300 358 } 301 359 302 360 /* get, pin, and map the pages of the object into kernel space */ ··· 368 386 } 369 387 370 388 if (!ptr) { 371 - ptr = i915_gem_object_map(obj, type); 389 + if (GEM_WARN_ON(type == I915_MAP_WC && 390 + !static_cpu_has(X86_FEATURE_PAT))) 391 + ptr = NULL; 392 + else if (i915_gem_object_has_struct_page(obj)) 393 + ptr = i915_gem_object_map_page(obj, type); 394 + else 395 + ptr = i915_gem_object_map_pfn(obj, type); 372 396 if (!ptr) { 373 397 err = -ENOMEM; 374 398 goto err_unpin;
+18 -58
drivers/gpu/drm/i915/gt/shmem_utils.c
··· 49 49 return file; 50 50 } 51 51 52 - static size_t shmem_npte(struct file *file) 53 - { 54 - return file->f_mapping->host->i_size >> PAGE_SHIFT; 55 - } 56 - 57 - static void __shmem_unpin_map(struct file *file, void *ptr, size_t n_pte) 58 - { 59 - unsigned long pfn; 60 - 61 - vunmap(ptr); 62 - 63 - for (pfn = 0; pfn < n_pte; pfn++) { 64 - struct page *page; 65 - 66 - page = shmem_read_mapping_page_gfp(file->f_mapping, pfn, 67 - GFP_KERNEL); 68 - if (!WARN_ON(IS_ERR(page))) { 69 - put_page(page); 70 - put_page(page); 71 - } 72 - } 73 - } 74 - 75 52 void *shmem_pin_map(struct file *file) 76 53 { 77 - const size_t n_pte = shmem_npte(file); 78 - pte_t *stack[32], **ptes, **mem; 79 - struct vm_struct *area; 80 - unsigned long pfn; 54 + struct page **pages; 55 + size_t n_pages, i; 56 + void *vaddr; 81 57 82 - mem = stack; 83 - if (n_pte > ARRAY_SIZE(stack)) { 84 - mem = kvmalloc_array(n_pte, sizeof(*mem), GFP_KERNEL); 85 - if (!mem) 86 - return NULL; 87 - } 88 - 89 - area = alloc_vm_area(n_pte << PAGE_SHIFT, mem); 90 - if (!area) { 91 - if (mem != stack) 92 - kvfree(mem); 58 + n_pages = file->f_mapping->host->i_size >> PAGE_SHIFT; 59 + pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL); 60 + if (!pages) 93 61 return NULL; 94 - } 95 62 96 - ptes = mem; 97 - for (pfn = 0; pfn < n_pte; pfn++) { 98 - struct page *page; 99 - 100 - page = shmem_read_mapping_page_gfp(file->f_mapping, pfn, 101 - GFP_KERNEL); 102 - if (IS_ERR(page)) 63 + for (i = 0; i < n_pages; i++) { 64 + pages[i] = shmem_read_mapping_page_gfp(file->f_mapping, i, 65 + GFP_KERNEL); 66 + if (IS_ERR(pages[i])) 103 67 goto err_page; 104 - 105 - **ptes++ = mk_pte(page, PAGE_KERNEL); 106 68 } 107 69 108 - if (mem != stack) 109 - kvfree(mem); 110 - 70 + vaddr = vmap(pages, n_pages, VM_MAP_PUT_PAGES, PAGE_KERNEL); 71 + if (!vaddr) 72 + goto err_page; 111 73 mapping_set_unevictable(file->f_mapping); 112 - return area->addr; 113 - 74 + return vaddr; 114 75 err_page: 115 - if (mem != stack) 116 - 
kvfree(mem); 117 - 118 - __shmem_unpin_map(file, area->addr, pfn); 76 + while (--i >= 0) 77 + put_page(pages[i]); 78 + kvfree(pages); 119 79 return NULL; 120 80 } 121 81 122 82 void shmem_unpin_map(struct file *file, void *ptr) 123 83 { 124 84 mapping_clear_unevictable(file->f_mapping); 125 - __shmem_unpin_map(file, ptr, shmem_npte(file)); 85 + vfree(ptr); 126 86 } 127 87 128 88 static int __shmem_rw(struct file *file, loff_t off,
+16 -14
drivers/xen/xenbus/xenbus_client.c
··· 73 73 struct xenbus_map_node *node; 74 74 75 75 /* Why do we need two arrays? See comment of __xenbus_map_ring */ 76 - union { 77 - unsigned long addrs[XENBUS_MAX_RING_GRANTS]; 78 - pte_t *ptes[XENBUS_MAX_RING_GRANTS]; 79 - }; 76 + unsigned long addrs[XENBUS_MAX_RING_GRANTS]; 80 77 phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS]; 81 78 82 79 struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS]; 83 80 struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS]; 84 81 85 - unsigned int idx; /* HVM only. */ 82 + unsigned int idx; 86 83 }; 87 84 88 85 static DEFINE_SPINLOCK(xenbus_valloc_lock); ··· 683 686 EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); 684 687 685 688 #ifdef CONFIG_XEN_PV 689 + static int map_ring_apply(pte_t *pte, unsigned long addr, void *data) 690 + { 691 + struct map_ring_valloc *info = data; 692 + 693 + info->phys_addrs[info->idx++] = arbitrary_virt_to_machine(pte).maddr; 694 + return 0; 695 + } 696 + 686 697 static int xenbus_map_ring_pv(struct xenbus_device *dev, 687 698 struct map_ring_valloc *info, 688 699 grant_ref_t *gnt_refs, ··· 699 694 { 700 695 struct xenbus_map_node *node = info->node; 701 696 struct vm_struct *area; 702 - int err = GNTST_okay; 703 - int i; 704 - bool leaked; 697 + bool leaked = false; 698 + int err = -ENOMEM; 705 699 706 - area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, info->ptes); 700 + area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP); 707 701 if (!area) 708 702 return -ENOMEM; 709 - 710 - for (i = 0; i < nr_grefs; i++) 711 - info->phys_addrs[i] = 712 - arbitrary_virt_to_machine(info->ptes[i]).maddr; 713 - 703 + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 704 + XEN_PAGE_SIZE * nr_grefs, map_ring_apply, info)) 705 + goto failed; 714 706 err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, 715 707 info, GNTMAP_host_map | GNTMAP_contains_pte, 716 708 &leaked);
+3
fs/binfmt_elf.c
··· 310 310 * Grow the stack manually; some architectures have a limit on how 311 311 * far ahead a user-space access may be in order to grow the stack. 312 312 */ 313 + if (mmap_read_lock_killable(mm)) 314 + return -EINTR; 313 315 vma = find_extend_vma(mm, bprm->p); 316 + mmap_read_unlock(mm); 314 317 if (!vma) 315 318 return -EFAULT; 316 319
+3 -3
fs/buffer.c
··· 842 842 struct buffer_head *bh, *head; 843 843 gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; 844 844 long offset; 845 - struct mem_cgroup *memcg; 845 + struct mem_cgroup *memcg, *old_memcg; 846 846 847 847 if (retry) 848 848 gfp |= __GFP_NOFAIL; 849 849 850 850 memcg = get_mem_cgroup_from_page(page); 851 - memalloc_use_memcg(memcg); 851 + old_memcg = set_active_memcg(memcg); 852 852 853 853 head = NULL; 854 854 offset = PAGE_SIZE; ··· 867 867 set_bh_page(bh, page, offset); 868 868 } 869 869 out: 870 - memalloc_unuse_memcg(); 870 + set_active_memcg(old_memcg); 871 871 mem_cgroup_put(memcg); 872 872 return head; 873 873 /*
+1 -1
fs/io_uring.c
··· 3989 3989 if (force_nonblock) 3990 3990 return -EAGAIN; 3991 3991 3992 - ret = do_madvise(ma->addr, ma->len, ma->advice); 3992 + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); 3993 3993 if (ret < 0) 3994 3994 req_set_fail_links(req); 3995 3995 io_req_complete(req, ret);
+3 -2
fs/notify/fanotify/fanotify.c
··· 531 531 struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); 532 532 const struct path *path = fsnotify_data_path(data, data_type); 533 533 unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); 534 + struct mem_cgroup *old_memcg; 534 535 struct inode *child = NULL; 535 536 bool name_event = false; 536 537 ··· 581 580 gfp |= __GFP_RETRY_MAYFAIL; 582 581 583 582 /* Whoever is interested in the event, pays for the allocation. */ 584 - memalloc_use_memcg(group->memcg); 583 + old_memcg = set_active_memcg(group->memcg); 585 584 586 585 if (fanotify_is_perm_event(mask)) { 587 586 event = fanotify_alloc_perm_event(path, gfp); ··· 609 608 event->pid = get_pid(task_tgid(current)); 610 609 611 610 out: 612 - memalloc_unuse_memcg(); 611 + set_active_memcg(old_memcg); 613 612 return event; 614 613 } 615 614
+3 -2
fs/notify/inotify/inotify_fsnotify.c
··· 66 66 int ret; 67 67 int len = 0; 68 68 int alloc_len = sizeof(struct inotify_event_info); 69 + struct mem_cgroup *old_memcg; 69 70 70 71 if ((inode_mark->mask & FS_EXCL_UNLINK) && 71 72 path && d_unlinked(path->dentry)) ··· 88 87 * trigger OOM killer in the target monitoring memcg as it may have 89 88 * security repercussion. 90 89 */ 91 - memalloc_use_memcg(group->memcg); 90 + old_memcg = set_active_memcg(group->memcg); 92 91 event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); 93 - memalloc_unuse_memcg(); 92 + set_active_memcg(old_memcg); 94 93 95 94 if (unlikely(!event)) { 96 95 /*
-12
include/linux/memcontrol.h
··· 1531 1531 return static_branch_likely(&memcg_kmem_enabled_key); 1532 1532 } 1533 1533 1534 - static inline bool memcg_kmem_bypass(void) 1535 - { 1536 - if (in_interrupt()) 1537 - return true; 1538 - 1539 - /* Allow remote memcg charging in kthread contexts. */ 1540 - if ((!current->mm || (current->flags & PF_KTHREAD)) && 1541 - !current->active_memcg) 1542 - return true; 1543 - return false; 1544 - } 1545 - 1546 1534 static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, 1547 1535 int order) 1548 1536 {
+1 -1
include/linux/mm.h
··· 2579 2579 struct list_head *uf, bool downgrade); 2580 2580 extern int do_munmap(struct mm_struct *, unsigned long, size_t, 2581 2581 struct list_head *uf); 2582 - extern int do_madvise(unsigned long start, size_t len_in, int behavior); 2582 + extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); 2583 2583 2584 2584 #ifdef CONFIG_MMU 2585 2585 extern int __mm_populate(unsigned long addr, unsigned long len,
+1
include/linux/pid.h
··· 77 77 struct file; 78 78 79 79 extern struct pid *pidfd_pid(const struct file *file); 80 + struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); 80 81 81 82 static inline struct pid *get_pid(struct pid *pid) 82 83 {
+19 -20
include/linux/sched/mm.h
··· 279 279 #endif 280 280 281 281 #ifdef CONFIG_MEMCG 282 + DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); 282 283 /** 283 - * memalloc_use_memcg - Starts the remote memcg charging scope. 284 + * set_active_memcg - Starts the remote memcg charging scope. 284 285 * @memcg: memcg to charge. 285 286 * 286 287 * This function marks the beginning of the remote memcg charging scope. All the 287 288 * __GFP_ACCOUNT allocations till the end of the scope will be charged to the 288 289 * given memcg. 289 290 * 290 - * NOTE: This function is not nesting safe. 291 + * NOTE: This function can nest. Users must save the return value and 292 + * reset the previous value after their own charging scope is over. 291 293 */ 292 - static inline void memalloc_use_memcg(struct mem_cgroup *memcg) 294 + static inline struct mem_cgroup * 295 + set_active_memcg(struct mem_cgroup *memcg) 293 296 { 294 - WARN_ON_ONCE(current->active_memcg); 295 - current->active_memcg = memcg; 296 - } 297 + struct mem_cgroup *old; 297 298 298 - /** 299 - * memalloc_unuse_memcg - Ends the remote memcg charging scope. 300 - * 301 - * This function marks the end of the remote memcg charging scope started by 302 - * memalloc_use_memcg(). 303 - */ 304 - static inline void memalloc_unuse_memcg(void) 305 - { 306 - current->active_memcg = NULL; 299 + if (in_interrupt()) { 300 + old = this_cpu_read(int_active_memcg); 301 + this_cpu_write(int_active_memcg, memcg); 302 + } else { 303 + old = current->active_memcg; 304 + current->active_memcg = memcg; 305 + } 306 + 307 + return old; 307 308 } 308 309 #else 309 - static inline void memalloc_use_memcg(struct mem_cgroup *memcg) 310 + static inline struct mem_cgroup * 311 + set_active_memcg(struct mem_cgroup *memcg) 310 312 { 311 - } 312 - 313 - static inline void memalloc_unuse_memcg(void) 314 - { 313 + return NULL; 315 314 } 316 315 #endif 317 316
+2
include/linux/syscalls.h
··· 879 879 asmlinkage long sys_mincore(unsigned long start, size_t len, 880 880 unsigned char __user * vec); 881 881 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); 882 + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, 883 + size_t vlen, int behavior, unsigned int flags); 882 884 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, 883 885 unsigned long prot, unsigned long pgoff, 884 886 unsigned long flags);
+3 -4
include/linux/vmalloc.h
··· 24 24 #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ 25 25 #define VM_NO_GUARD 0x00000040 /* don't add guard page */ 26 26 #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ 27 + #define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */ 27 28 28 29 /* 29 30 * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. ··· 122 121 123 122 extern void *vmap(struct page **pages, unsigned int count, 124 123 unsigned long flags, pgprot_t prot); 124 + void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot); 125 125 extern void vunmap(const void *addr); 126 126 127 127 extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, ··· 169 167 unsigned long flags, 170 168 unsigned long start, unsigned long end, 171 169 const void *caller); 170 + void free_vm_area(struct vm_struct *area); 172 171 extern struct vm_struct *remove_vm_area(const void *addr); 173 172 extern struct vm_struct *find_vm_area(const void *addr); 174 173 ··· 204 201 { 205 202 } 206 203 #endif 207 - 208 - /* Allocate/destroy a 'vmalloc' VM area. */ 209 - extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes); 210 - extern void free_vm_area(struct vm_struct *area); 211 204 212 205 /* for /dev/kmem */ 213 206 extern long vread(char *buf, char *addr, unsigned long count);
+3 -1
include/uapi/asm-generic/unistd.h
··· 857 857 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) 858 858 #define __NR_faccessat2 439 859 859 __SYSCALL(__NR_faccessat2, sys_faccessat2) 860 + #define __NR_process_madvise 440 861 + __SYSCALL(__NR_process_madvise, sys_process_madvise) 860 862 861 863 #undef __NR_syscalls 862 - #define __NR_syscalls 440 864 + #define __NR_syscalls 441 863 865 864 866 /* 865 867 * 32 bit systems traditionally used different
-19
kernel/exit.c
··· 1474 1474 return retval; 1475 1475 } 1476 1476 1477 - static struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) 1478 - { 1479 - struct fd f; 1480 - struct pid *pid; 1481 - 1482 - f = fdget(fd); 1483 - if (!f.file) 1484 - return ERR_PTR(-EBADF); 1485 - 1486 - pid = pidfd_pid(f.file); 1487 - if (!IS_ERR(pid)) { 1488 - get_pid(pid); 1489 - *flags = f.file->f_flags; 1490 - } 1491 - 1492 - fdput(f); 1493 - return pid; 1494 - } 1495 - 1496 1477 static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, 1497 1478 int options, struct rusage *ru) 1498 1479 {
+19
kernel/pid.c
··· 520 520 return idr_get_next(&ns->idr, &nr); 521 521 } 522 522 523 + struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) 524 + { 525 + struct fd f; 526 + struct pid *pid; 527 + 528 + f = fdget(fd); 529 + if (!f.file) 530 + return ERR_PTR(-EBADF); 531 + 532 + pid = pidfd_pid(f.file); 533 + if (!IS_ERR(pid)) { 534 + get_pid(pid); 535 + *flags = f.file->f_flags; 536 + } 537 + 538 + fdput(f); 539 + return pid; 540 + } 541 + 523 542 /** 524 543 * pidfd_create() - Create a new pid file descriptor. 525 544 *
+1
kernel/sys_ni.c
··· 280 280 COND_SYSCALL(munlockall); 281 281 COND_SYSCALL(mincore); 282 282 COND_SYSCALL(madvise); 283 + COND_SYSCALL(process_madvise); 283 284 COND_SYSCALL(remap_file_pages); 284 285 COND_SYSCALL(mbind); 285 286 COND_SYSCALL_COMPAT(mbind);
+3
mm/Kconfig
··· 816 816 memory; i.e., memory that is only accessible from the device (or 817 817 group of devices). You likely also want to select HMM_MIRROR. 818 818 819 + config VMAP_PFN 820 + bool 821 + 819 822 config FRAME_VECTOR 820 823 bool 821 824
+12 -3
mm/gup_benchmark.c
··· 72 72 int nr; 73 73 struct page **pages; 74 74 int ret = 0; 75 + bool needs_mmap_lock = 76 + cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK; 75 77 76 78 if (gup->size > ULONG_MAX) 77 79 return -EINVAL; ··· 82 80 pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); 83 81 if (!pages) 84 82 return -ENOMEM; 83 + 84 + if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) { 85 + ret = -EINTR; 86 + goto free_pages; 87 + } 85 88 86 89 i = 0; 87 90 nr = gup->nr_pages_per_call; ··· 127 120 pages + i, NULL); 128 121 break; 129 122 default: 130 - kvfree(pages); 131 123 ret = -EINVAL; 132 - goto out; 124 + goto unlock; 133 125 } 134 126 135 127 if (nr <= 0) ··· 156 150 end_time = ktime_get(); 157 151 gup->put_delta_usec = ktime_us_delta(end_time, start_time); 158 152 153 + unlock: 154 + if (needs_mmap_lock) 155 + mmap_read_unlock(current->mm); 156 + free_pages: 159 157 kvfree(pages); 160 - out: 161 158 return ret; 162 159 } 163 160
+110 -15
mm/madvise.c
··· 17 17 #include <linux/falloc.h> 18 18 #include <linux/fadvise.h> 19 19 #include <linux/sched.h> 20 + #include <linux/sched/mm.h> 21 + #include <linux/uio.h> 20 22 #include <linux/ksm.h> 21 23 #include <linux/fs.h> 22 24 #include <linux/file.h> ··· 29 27 #include <linux/swapops.h> 30 28 #include <linux/shmem_fs.h> 31 29 #include <linux/mmu_notifier.h> 32 - #include <linux/sched/mm.h> 33 30 34 31 #include <asm/tlb.h> 35 32 ··· 259 258 struct vm_area_struct **prev, 260 259 unsigned long start, unsigned long end) 261 260 { 261 + struct mm_struct *mm = vma->vm_mm; 262 262 struct file *file = vma->vm_file; 263 263 loff_t offset; 264 264 ··· 296 294 get_file(file); 297 295 offset = (loff_t)(start - vma->vm_start) 298 296 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 299 - mmap_read_unlock(current->mm); 297 + mmap_read_unlock(mm); 300 298 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 301 299 fput(file); 302 - mmap_read_lock(current->mm); 300 + mmap_read_lock(mm); 303 301 return 0; 304 302 } 305 303 ··· 768 766 unsigned long start, unsigned long end, 769 767 int behavior) 770 768 { 769 + struct mm_struct *mm = vma->vm_mm; 770 + 771 771 *prev = vma; 772 772 if (!can_madv_lru_vma(vma)) 773 773 return -EINVAL; ··· 777 773 if (!userfaultfd_remove(vma, start, end)) { 778 774 *prev = NULL; /* mmap_lock has been dropped, prev is stale */ 779 775 780 - mmap_read_lock(current->mm); 781 - vma = find_vma(current->mm, start); 776 + mmap_read_lock(mm); 777 + vma = find_vma(mm, start); 782 778 if (!vma) 783 779 return -ENOMEM; 784 780 if (start < vma->vm_start) { ··· 832 828 loff_t offset; 833 829 int error; 834 830 struct file *f; 831 + struct mm_struct *mm = vma->vm_mm; 835 832 836 833 *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 837 834 ··· 860 855 get_file(f); 861 856 if (userfaultfd_remove(vma, start, end)) { 862 857 /* mmap_lock was not released by userfaultfd_remove() */ 863 - mmap_read_unlock(current->mm); 858 + mmap_read_unlock(mm); 864 859 } 865 860 
error = vfs_fallocate(f, 866 861 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 867 862 offset, end - start); 868 863 fput(f); 869 - mmap_read_lock(current->mm); 864 + mmap_read_lock(mm); 870 865 return error; 871 866 } 872 867 ··· 989 984 } 990 985 } 991 986 987 + static bool 988 + process_madvise_behavior_valid(int behavior) 989 + { 990 + switch (behavior) { 991 + case MADV_COLD: 992 + case MADV_PAGEOUT: 993 + return true; 994 + default: 995 + return false; 996 + } 997 + } 998 + 992 999 /* 993 1000 * The madvise(2) system call. 994 1001 * ··· 1048 1031 * MADV_DONTDUMP - the application wants to prevent pages in the given range 1049 1032 * from being included in its core dump. 1050 1033 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 1034 + * MADV_COLD - the application is not expected to use this memory soon, 1035 + * deactivate pages in this range so that they can be reclaimed 1036 + * easily if memory pressure hanppens. 1037 + * MADV_PAGEOUT - the application is not expected to use this memory soon, 1038 + * page out the pages in this range immediately. 1051 1039 * 1052 1040 * return values: 1053 1041 * zero - success ··· 1067 1045 * -EBADF - map exists, but area maps something that isn't a file. 1068 1046 * -EAGAIN - a kernel resource was temporarily unavailable. 1069 1047 */ 1070 - int do_madvise(unsigned long start, size_t len_in, int behavior) 1048 + int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 1071 1049 { 1072 1050 unsigned long end, tmp; 1073 1051 struct vm_area_struct *vma, *prev; ··· 1105 1083 1106 1084 write = madvise_need_mmap_write(behavior); 1107 1085 if (write) { 1108 - if (mmap_write_lock_killable(current->mm)) 1086 + if (mmap_write_lock_killable(mm)) 1109 1087 return -EINTR; 1110 1088 } else { 1111 - mmap_read_lock(current->mm); 1089 + mmap_read_lock(mm); 1112 1090 } 1113 1091 1114 1092 /* ··· 1116 1094 * ranges, just ignore them, but return -ENOMEM at the end. 
1117 1095 * - different from the way of handling in mlock etc. 1118 1096 */ 1119 - vma = find_vma_prev(current->mm, start, &prev); 1097 + vma = find_vma_prev(mm, start, &prev); 1120 1098 if (vma && start > vma->vm_start) 1121 1099 prev = vma; 1122 1100 ··· 1153 1131 if (prev) 1154 1132 vma = prev->vm_next; 1155 1133 else /* madvise_remove dropped mmap_lock */ 1156 - vma = find_vma(current->mm, start); 1134 + vma = find_vma(mm, start); 1157 1135 } 1158 1136 out: 1159 1137 blk_finish_plug(&plug); 1160 1138 if (write) 1161 - mmap_write_unlock(current->mm); 1139 + mmap_write_unlock(mm); 1162 1140 else 1163 - mmap_read_unlock(current->mm); 1141 + mmap_read_unlock(mm); 1164 1142 1165 1143 return error; 1166 1144 } 1167 1145 1168 1146 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 1169 1147 { 1170 - return do_madvise(start, len_in, behavior); 1148 + return do_madvise(current->mm, start, len_in, behavior); 1149 + } 1150 + 1151 + SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 1152 + size_t, vlen, int, behavior, unsigned int, flags) 1153 + { 1154 + ssize_t ret; 1155 + struct iovec iovstack[UIO_FASTIOV], iovec; 1156 + struct iovec *iov = iovstack; 1157 + struct iov_iter iter; 1158 + struct pid *pid; 1159 + struct task_struct *task; 1160 + struct mm_struct *mm; 1161 + size_t total_len; 1162 + unsigned int f_flags; 1163 + 1164 + if (flags != 0) { 1165 + ret = -EINVAL; 1166 + goto out; 1167 + } 1168 + 1169 + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 1170 + if (ret < 0) 1171 + goto out; 1172 + 1173 + pid = pidfd_get_pid(pidfd, &f_flags); 1174 + if (IS_ERR(pid)) { 1175 + ret = PTR_ERR(pid); 1176 + goto free_iov; 1177 + } 1178 + 1179 + task = get_pid_task(pid, PIDTYPE_PID); 1180 + if (!task) { 1181 + ret = -ESRCH; 1182 + goto put_pid; 1183 + } 1184 + 1185 + if (task->mm != current->mm && 1186 + !process_madvise_behavior_valid(behavior)) { 1187 + ret = -EINVAL; 1188 + goto release_task; 1189 
+ } 1190 + 1191 + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); 1192 + if (IS_ERR_OR_NULL(mm)) { 1193 + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 1194 + goto release_task; 1195 + } 1196 + 1197 + total_len = iov_iter_count(&iter); 1198 + 1199 + while (iov_iter_count(&iter)) { 1200 + iovec = iov_iter_iovec(&iter); 1201 + ret = do_madvise(mm, (unsigned long)iovec.iov_base, 1202 + iovec.iov_len, behavior); 1203 + if (ret < 0) 1204 + break; 1205 + iov_iter_advance(&iter, iovec.iov_len); 1206 + } 1207 + 1208 + if (ret == 0) 1209 + ret = total_len - iov_iter_count(&iter); 1210 + 1211 + mmput(mm); 1212 + return ret; 1213 + 1214 + release_task: 1215 + put_task_struct(task); 1216 + put_pid: 1217 + put_pid(pid); 1218 + free_iov: 1219 + kfree(iov); 1220 + out: 1221 + return ret; 1171 1222 }
+54 -21
mm/memcontrol.c
··· 73 73 74 74 struct mem_cgroup *root_mem_cgroup __read_mostly; 75 75 76 + /* Active memory cgroup to use from an interrupt context */ 77 + DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); 78 + 76 79 /* Socket memory accounting disabled? */ 77 80 static bool cgroup_memory_nosocket; 78 81 ··· 1064 1061 } 1065 1062 EXPORT_SYMBOL(get_mem_cgroup_from_page); 1066 1063 1067 - /** 1068 - * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg. 1069 - */ 1070 - static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) 1064 + static __always_inline struct mem_cgroup *active_memcg(void) 1071 1065 { 1072 - if (unlikely(current->active_memcg)) { 1073 - struct mem_cgroup *memcg; 1066 + if (in_interrupt()) 1067 + return this_cpu_read(int_active_memcg); 1068 + else 1069 + return current->active_memcg; 1070 + } 1074 1071 1075 - rcu_read_lock(); 1072 + static __always_inline struct mem_cgroup *get_active_memcg(void) 1073 + { 1074 + struct mem_cgroup *memcg; 1075 + 1076 + rcu_read_lock(); 1077 + memcg = active_memcg(); 1078 + if (memcg) { 1076 1079 /* current->active_memcg must hold a ref. */ 1077 - if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css))) 1080 + if (WARN_ON_ONCE(!css_tryget(&memcg->css))) 1078 1081 memcg = root_mem_cgroup; 1079 1082 else 1080 1083 memcg = current->active_memcg; 1081 - rcu_read_unlock(); 1082 - return memcg; 1083 1084 } 1085 + rcu_read_unlock(); 1086 + 1087 + return memcg; 1088 + } 1089 + 1090 + static __always_inline bool memcg_kmem_bypass(void) 1091 + { 1092 + /* Allow remote memcg charging from any context. */ 1093 + if (unlikely(active_memcg())) 1094 + return false; 1095 + 1096 + /* Memcg to charge can't be determined. */ 1097 + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) 1098 + return true; 1099 + 1100 + return false; 1101 + } 1102 + 1103 + /** 1104 + * If active memcg is set, do not fallback to current->mm->memcg. 
1105 + */ 1106 + static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) 1107 + { 1108 + if (memcg_kmem_bypass()) 1109 + return NULL; 1110 + 1111 + if (unlikely(active_memcg())) 1112 + return get_active_memcg(); 1113 + 1084 1114 return get_mem_cgroup_from_mm(current->mm); 1085 1115 } 1086 1116 ··· 2969 2933 struct obj_cgroup *objcg = NULL; 2970 2934 struct mem_cgroup *memcg; 2971 2935 2972 - if (unlikely(!current->mm && !current->active_memcg)) 2936 + if (memcg_kmem_bypass()) 2973 2937 return NULL; 2974 2938 2975 2939 rcu_read_lock(); 2976 - if (unlikely(current->active_memcg)) 2977 - memcg = rcu_dereference(current->active_memcg); 2940 + if (unlikely(active_memcg())) 2941 + memcg = active_memcg(); 2978 2942 else 2979 2943 memcg = mem_cgroup_from_task(current); 2980 2944 ··· 3095 3059 struct mem_cgroup *memcg; 3096 3060 int ret = 0; 3097 3061 3098 - if (memcg_kmem_bypass()) 3099 - return 0; 3100 - 3101 3062 memcg = get_mem_cgroup_from_current(); 3102 - if (!mem_cgroup_is_root(memcg)) { 3063 + if (memcg && !mem_cgroup_is_root(memcg)) { 3103 3064 ret = __memcg_kmem_charge(memcg, gfp, 1 << order); 3104 3065 if (!ret) { 3105 3066 page->mem_cgroup = memcg; 3106 3067 __SetPageKmemcg(page); 3107 3068 return 0; 3108 3069 } 3070 + css_put(&memcg->css); 3109 3071 } 3110 - css_put(&memcg->css); 3111 3072 return ret; 3112 3073 } 3113 3074 ··· 5323 5290 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5324 5291 { 5325 5292 struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); 5326 - struct mem_cgroup *memcg; 5293 + struct mem_cgroup *memcg, *old_memcg; 5327 5294 long error = -ENOMEM; 5328 5295 5329 - memalloc_use_memcg(parent); 5296 + old_memcg = set_active_memcg(parent); 5330 5297 memcg = mem_cgroup_alloc(); 5331 - memalloc_unuse_memcg(); 5298 + set_active_memcg(old_memcg); 5332 5299 if (IS_ERR(memcg)) 5333 5300 return ERR_CAST(memcg); 5334 5301
+6 -12
mm/memory-failure.c
··· 1673 1673 } 1674 1674 EXPORT_SYMBOL(unpoison_memory); 1675 1675 1676 - static struct page *new_page(struct page *p, unsigned long private) 1677 - { 1678 - struct migration_target_control mtc = { 1679 - .nid = page_to_nid(p), 1680 - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 1681 - }; 1682 - 1683 - return alloc_migration_target(p, (unsigned long)&mtc); 1684 - } 1685 - 1686 1676 /* 1687 1677 * Safely get reference count of an arbitrary page. 1688 1678 * Returns 0 for a free page, -EIO for a zero refcount page ··· 1787 1797 char const *msg_page[] = {"page", "hugepage"}; 1788 1798 bool huge = PageHuge(page); 1789 1799 LIST_HEAD(pagelist); 1800 + struct migration_target_control mtc = { 1801 + .nid = NUMA_NO_NODE, 1802 + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 1803 + }; 1790 1804 1791 1805 /* 1792 1806 * Check PageHWPoison again inside page lock because PageHWPoison ··· 1827 1833 } 1828 1834 1829 1835 if (isolate_page(hpage, &pagelist)) { 1830 - ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1831 - MIGRATE_SYNC, MR_MEMORY_FAILURE); 1836 + ret = migrate_pages(&pagelist, alloc_migration_target, NULL, 1837 + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); 1832 1838 if (!ret) { 1833 1839 bool release = !huge; 1834 1840
+9 -7
mm/memory.c
··· 2391 2391 2392 2392 arch_enter_lazy_mmu_mode(); 2393 2393 2394 - do { 2395 - if (create || !pte_none(*pte)) { 2396 - err = fn(pte++, addr, data); 2397 - if (err) 2398 - break; 2399 - } 2400 - } while (addr += PAGE_SIZE, addr != end); 2394 + if (fn) { 2395 + do { 2396 + if (create || !pte_none(*pte)) { 2397 + err = fn(pte++, addr, data); 2398 + if (err) 2399 + break; 2400 + } 2401 + } while (addr += PAGE_SIZE, addr != end); 2402 + } 2401 2403 *mask |= PGTBL_PTE_MODIFIED; 2402 2404 2403 2405 arch_leave_lazy_mmu_mode();
+22 -24
mm/memory_hotplug.c
··· 1290 1290 return 0; 1291 1291 } 1292 1292 1293 - static struct page *new_node_page(struct page *page, unsigned long private) 1294 - { 1295 - nodemask_t nmask = node_states[N_MEMORY]; 1296 - struct migration_target_control mtc = { 1297 - .nid = page_to_nid(page), 1298 - .nmask = &nmask, 1299 - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 1300 - }; 1301 - 1302 - /* 1303 - * try to allocate from a different node but reuse this node if there 1304 - * are no other online nodes to be used (e.g. we are offlining a part 1305 - * of the only existing node) 1306 - */ 1307 - node_clear(mtc.nid, nmask); 1308 - if (nodes_empty(nmask)) 1309 - node_set(mtc.nid, nmask); 1310 - 1311 - return alloc_migration_target(page, (unsigned long)&mtc); 1312 - } 1313 - 1314 1293 static int 1315 1294 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 1316 1295 { ··· 1349 1370 put_page(page); 1350 1371 } 1351 1372 if (!list_empty(&source)) { 1352 - /* Allocate a new page from the nearest neighbor node */ 1353 - ret = migrate_pages(&source, new_node_page, NULL, 0, 1354 - MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1373 + nodemask_t nmask = node_states[N_MEMORY]; 1374 + struct migration_target_control mtc = { 1375 + .nmask = &nmask, 1376 + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, 1377 + }; 1378 + 1379 + /* 1380 + * We have checked that migration range is on a single zone so 1381 + * we can use the nid of the first page to all the others. 1382 + */ 1383 + mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); 1384 + 1385 + /* 1386 + * try to allocate from a different node but reuse this node 1387 + * if there are no other online nodes to be used (e.g. 
we are 1388 + * offlining a part of the only existing node) 1389 + */ 1390 + node_clear(mtc.nid, nmask); 1391 + if (nodes_empty(nmask)) 1392 + node_set(mtc.nid, nmask); 1393 + ret = migrate_pages(&source, alloc_migration_target, NULL, 1394 + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1355 1395 if (ret) { 1356 1396 list_for_each_entry(page, &source, lru) { 1357 1397 pr_warn("migrating pfn %lx failed ret:%d ",
+50 -35
mm/migrate.c
··· 1864 1864 return nr_pages ? -EFAULT : 0; 1865 1865 } 1866 1866 1867 + static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) 1868 + { 1869 + struct task_struct *task; 1870 + struct mm_struct *mm; 1871 + 1872 + /* 1873 + * There is no need to check if current process has the right to modify 1874 + * the specified process when they are same. 1875 + */ 1876 + if (!pid) { 1877 + mmget(current->mm); 1878 + *mem_nodes = cpuset_mems_allowed(current); 1879 + return current->mm; 1880 + } 1881 + 1882 + /* Find the mm_struct */ 1883 + rcu_read_lock(); 1884 + task = find_task_by_vpid(pid); 1885 + if (!task) { 1886 + rcu_read_unlock(); 1887 + return ERR_PTR(-ESRCH); 1888 + } 1889 + get_task_struct(task); 1890 + 1891 + /* 1892 + * Check if this process has the right to modify the specified 1893 + * process. Use the regular "ptrace_may_access()" checks. 1894 + */ 1895 + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1896 + rcu_read_unlock(); 1897 + mm = ERR_PTR(-EPERM); 1898 + goto out; 1899 + } 1900 + rcu_read_unlock(); 1901 + 1902 + mm = ERR_PTR(security_task_movememory(task)); 1903 + if (IS_ERR(mm)) 1904 + goto out; 1905 + *mem_nodes = cpuset_mems_allowed(task); 1906 + mm = get_task_mm(task); 1907 + out: 1908 + put_task_struct(task); 1909 + if (!mm) 1910 + mm = ERR_PTR(-EINVAL); 1911 + return mm; 1912 + } 1913 + 1867 1914 /* 1868 1915 * Move a list of pages in the address space of the currently executing 1869 1916 * process. ··· 1920 1873 const int __user *nodes, 1921 1874 int __user *status, int flags) 1922 1875 { 1923 - struct task_struct *task; 1924 1876 struct mm_struct *mm; 1925 1877 int err; 1926 1878 nodemask_t task_nodes; ··· 1931 1885 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1932 1886 return -EPERM; 1933 1887 1934 - /* Find the mm_struct */ 1935 - rcu_read_lock(); 1936 - task = pid ? 
find_task_by_vpid(pid) : current; 1937 - if (!task) { 1938 - rcu_read_unlock(); 1939 - return -ESRCH; 1940 - } 1941 - get_task_struct(task); 1942 - 1943 - /* 1944 - * Check if this process has the right to modify the specified 1945 - * process. Use the regular "ptrace_may_access()" checks. 1946 - */ 1947 - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 1948 - rcu_read_unlock(); 1949 - err = -EPERM; 1950 - goto out; 1951 - } 1952 - rcu_read_unlock(); 1953 - 1954 - err = security_task_movememory(task); 1955 - if (err) 1956 - goto out; 1957 - 1958 - task_nodes = cpuset_mems_allowed(task); 1959 - mm = get_task_mm(task); 1960 - put_task_struct(task); 1961 - 1962 - if (!mm) 1963 - return -EINVAL; 1888 + mm = find_mm_struct(pid, &task_nodes); 1889 + if (IS_ERR(mm)) 1890 + return PTR_ERR(mm); 1964 1891 1965 1892 if (nodes) 1966 1893 err = do_pages_move(mm, task_nodes, nr_pages, pages, ··· 1942 1923 err = do_pages_stat(mm, nr_pages, pages, status); 1943 1924 1944 1925 mmput(mm); 1945 - return err; 1946 - 1947 - out: 1948 - put_task_struct(task); 1949 1926 return err; 1950 1927 } 1951 1928
+53 -21
mm/mmap.c
··· 558 558 return 0; 559 559 } 560 560 561 + /* 562 + * vma_next() - Get the next VMA. 563 + * @mm: The mm_struct. 564 + * @vma: The current vma. 565 + * 566 + * If @vma is NULL, return the first vma in the mm. 567 + * 568 + * Returns: The next VMA after @vma. 569 + */ 570 + static inline struct vm_area_struct *vma_next(struct mm_struct *mm, 571 + struct vm_area_struct *vma) 572 + { 573 + if (!vma) 574 + return mm->mmap; 575 + 576 + return vma->vm_next; 577 + } 578 + 579 + /* 580 + * munmap_vma_range() - munmap VMAs that overlap a range. 581 + * @mm: The mm struct 582 + * @start: The start of the range. 583 + * @len: The length of the range. 584 + * @pprev: pointer to the pointer that will be set to previous vm_area_struct 585 + * @rb_link: the rb_node 586 + * @rb_parent: the parent rb_node 587 + * 588 + * Find all the vm_area_struct that overlap from @start to 589 + * @end and munmap them. Set @pprev to the previous vm_area_struct. 590 + * 591 + * Returns: -ENOMEM on munmap failure or 0 on success. 
592 + */ 593 + static inline int 594 + munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, 595 + struct vm_area_struct **pprev, struct rb_node ***link, 596 + struct rb_node **parent, struct list_head *uf) 597 + { 598 + 599 + while (find_vma_links(mm, start, start + len, pprev, link, parent)) 600 + if (do_munmap(mm, start, len, uf)) 601 + return -ENOMEM; 602 + 603 + return 0; 604 + } 561 605 static unsigned long count_vma_pages_range(struct mm_struct *mm, 562 606 unsigned long addr, unsigned long end) 563 607 { ··· 1172 1128 if (vm_flags & VM_SPECIAL) 1173 1129 return NULL; 1174 1130 1175 - if (prev) 1176 - next = prev->vm_next; 1177 - else 1178 - next = mm->mmap; 1131 + next = vma_next(mm, prev); 1179 1132 area = next; 1180 1133 if (area && area->vm_end == end) /* cases 6, 7, 8 */ 1181 1134 next = next->vm_next; ··· 1748 1707 return -ENOMEM; 1749 1708 } 1750 1709 1751 - /* Clear old maps */ 1752 - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, 1753 - &rb_parent)) { 1754 - if (do_munmap(mm, addr, len, uf)) 1755 - return -ENOMEM; 1756 - } 1757 - 1710 + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ 1711 + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) 1712 + return -ENOMEM; 1758 1713 /* 1759 1714 * Private writable mapping: check memory availability 1760 1715 */ ··· 2669 2632 struct vm_area_struct *vma, struct vm_area_struct *prev, 2670 2633 unsigned long start, unsigned long end) 2671 2634 { 2672 - struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; 2635 + struct vm_area_struct *next = vma_next(mm, prev); 2673 2636 struct mmu_gather tlb; 2674 2637 2675 2638 lru_add_drain(); ··· 2868 2831 if (error) 2869 2832 return error; 2870 2833 } 2871 - vma = prev ? prev->vm_next : mm->mmap; 2834 + vma = vma_next(mm, prev); 2872 2835 2873 2836 if (unlikely(uf)) { 2874 2837 /* ··· 3086 3049 if (error) 3087 3050 return error; 3088 3051 3089 - /* 3090 - * Clear old maps. 
this also does some error checking for us 3091 - */ 3092 - while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, 3093 - &rb_parent)) { 3094 - if (do_munmap(mm, addr, len, uf)) 3095 - return -ENOMEM; 3096 - } 3052 + /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ 3053 + if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) 3054 + return -ENOMEM; 3097 3055 3098 3056 /* Check against address space limits *after* clearing old maps... */ 3099 3057 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
-7
mm/nommu.c
··· 354 354 } 355 355 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 356 356 357 - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 358 - { 359 - BUG(); 360 - return NULL; 361 - } 362 - EXPORT_SYMBOL_GPL(alloc_vm_area); 363 - 364 357 void free_vm_area(struct vm_struct *area) 365 358 { 366 359 BUG();
+1 -2
mm/percpu.c
··· 1584 1584 { 1585 1585 struct obj_cgroup *objcg; 1586 1586 1587 - if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) || 1588 - memcg_kmem_bypass()) 1587 + if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) 1589 1588 return PCPU_CHUNK_ROOT; 1590 1589 1591 1590 objcg = get_obj_cgroup_from_current();
-3
mm/slab.h
··· 280 280 { 281 281 struct obj_cgroup *objcg; 282 282 283 - if (memcg_kmem_bypass()) 284 - return NULL; 285 - 286 283 objcg = get_obj_cgroup_from_current(); 287 284 if (!objcg) 288 285 return NULL;
+73 -74
mm/vmalloc.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 /* 3 - * linux/mm/vmalloc.c 4 - * 5 3 * Copyright (C) 1993 Linus Torvalds 6 4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 7 5 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 ··· 2319 2321 } 2320 2322 2321 2323 /** 2322 - * vfree - release memory allocated by vmalloc() 2323 - * @addr: memory base address 2324 + * vfree - Release memory allocated by vmalloc() 2325 + * @addr: Memory base address 2324 2326 * 2325 - * Free the virtually continuous memory area starting at @addr, as 2326 - * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 2327 - * NULL, no operation is performed. 2327 + * Free the virtually continuous memory area starting at @addr, as obtained 2328 + * from one of the vmalloc() family of APIs. This will usually also free the 2329 + * physical memory underlying the virtual allocation, but that memory is 2330 + * reference counted, so it will not be freed until the last user goes away. 2328 2331 * 2329 - * Must not be called in NMI context (strictly speaking, only if we don't 2330 - * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2331 - * conventions for vfree() arch-depenedent would be a really bad idea) 2332 + * If @addr is NULL, no operation is performed. 2332 2333 * 2334 + * Context: 2333 2335 * May sleep if called *not* from interrupt context. 2334 - * 2335 - * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) 2336 + * Must not be called in NMI context (strictly speaking, it could be 2337 + * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2338 + * conventions for vfree() arch-depenedent would be a really bad idea). 2336 2339 */ 2337 2340 void vfree(const void *addr) 2338 2341 { ··· 2375 2376 * @flags: vm_area->flags 2376 2377 * @prot: page protection for the mapping 2377 2378 * 2378 - * Maps @count pages from @pages into contiguous kernel virtual 2379 - * space. 
2379 + * Maps @count pages from @pages into contiguous kernel virtual space. 2380 + * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 2381 + * (which must be kmalloc or vmalloc memory) and one reference per pages in it 2382 + * are transferred from the caller to vmap(), and will be freed / dropped when 2383 + * vfree() is called on the return value. 2380 2384 * 2381 2385 * Return: the address of the area or %NULL on failure 2382 2386 */ ··· 2405 2403 return NULL; 2406 2404 } 2407 2405 2406 + if (flags & VM_MAP_PUT_PAGES) 2407 + area->pages = pages; 2408 2408 return area->addr; 2409 2409 } 2410 2410 EXPORT_SYMBOL(vmap); 2411 2411 2412 + #ifdef CONFIG_VMAP_PFN 2413 + struct vmap_pfn_data { 2414 + unsigned long *pfns; 2415 + pgprot_t prot; 2416 + unsigned int idx; 2417 + }; 2418 + 2419 + static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 2420 + { 2421 + struct vmap_pfn_data *data = private; 2422 + 2423 + if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) 2424 + return -EINVAL; 2425 + *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); 2426 + return 0; 2427 + } 2428 + 2429 + /** 2430 + * vmap_pfn - map an array of PFNs into virtually contiguous space 2431 + * @pfns: array of PFNs 2432 + * @count: number of pages to map 2433 + * @prot: page protection for the mapping 2434 + * 2435 + * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 2436 + * the start address of the mapping. 
2437 + */ 2438 + void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 2439 + { 2440 + struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 2441 + struct vm_struct *area; 2442 + 2443 + area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 2444 + __builtin_return_address(0)); 2445 + if (!area) 2446 + return NULL; 2447 + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2448 + count * PAGE_SIZE, vmap_pfn_apply, &data)) { 2449 + free_vm_area(area); 2450 + return NULL; 2451 + } 2452 + return area->addr; 2453 + } 2454 + EXPORT_SYMBOL_GPL(vmap_pfn); 2455 + #endif /* CONFIG_VMAP_PFN */ 2456 + 2412 2457 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2413 2458 pgprot_t prot, int node) 2414 2459 { 2415 - struct page **pages; 2416 - unsigned int nr_pages, array_size, i; 2417 2460 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2418 - const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; 2419 - const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ? 2420 - 0 : 2421 - __GFP_HIGHMEM; 2461 + unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2462 + unsigned int array_size = nr_pages * sizeof(struct page *), i; 2463 + struct page **pages; 2422 2464 2423 - nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2424 - array_size = (nr_pages * sizeof(struct page *)); 2465 + gfp_mask |= __GFP_NOWARN; 2466 + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 2467 + gfp_mask |= __GFP_HIGHMEM; 2425 2468 2426 2469 /* Please note that the recursion is strictly bounded. 
*/ 2427 2470 if (array_size > PAGE_SIZE) { 2428 - pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, 2429 - node, area->caller); 2471 + pages = __vmalloc_node(array_size, 1, nested_gfp, node, 2472 + area->caller); 2430 2473 } else { 2431 2474 pages = kmalloc_node(array_size, nested_gfp, node); 2432 2475 } ··· 2489 2442 struct page *page; 2490 2443 2491 2444 if (node == NUMA_NO_NODE) 2492 - page = alloc_page(alloc_mask|highmem_mask); 2445 + page = alloc_page(gfp_mask); 2493 2446 else 2494 - page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); 2447 + page = alloc_pages_node(node, gfp_mask, 0); 2495 2448 2496 2449 if (unlikely(!page)) { 2497 2450 /* Successfully allocated i pages, free them in __vfree() */ ··· 3078 3031 vma->vm_end - vma->vm_start); 3079 3032 } 3080 3033 EXPORT_SYMBOL(remap_vmalloc_range); 3081 - 3082 - static int f(pte_t *pte, unsigned long addr, void *data) 3083 - { 3084 - pte_t ***p = data; 3085 - 3086 - if (p) { 3087 - *(*p) = pte; 3088 - (*p)++; 3089 - } 3090 - return 0; 3091 - } 3092 - 3093 - /** 3094 - * alloc_vm_area - allocate a range of kernel address space 3095 - * @size: size of the area 3096 - * @ptes: returns the PTEs for the address space 3097 - * 3098 - * Returns: NULL on failure, vm_struct on success 3099 - * 3100 - * This function reserves a range of kernel address space, and 3101 - * allocates pagetables to map that range. No actual mappings 3102 - * are created. 3103 - * 3104 - * If @ptes is non-NULL, pointers to the PTEs (in init_mm) 3105 - * allocated for the VM area are returned. 3106 - */ 3107 - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 3108 - { 3109 - struct vm_struct *area; 3110 - 3111 - area = get_vm_area_caller(size, VM_IOREMAP, 3112 - __builtin_return_address(0)); 3113 - if (area == NULL) 3114 - return NULL; 3115 - 3116 - /* 3117 - * This ensures that page tables are constructed for this region 3118 - * of kernel virtual address space and mapped into init_mm. 
3119 - */ 3120 - if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 3121 - size, f, ptes ? &ptes : NULL)) { 3122 - free_vm_area(area); 3123 - return NULL; 3124 - } 3125 - 3126 - return area; 3127 - } 3128 - EXPORT_SYMBOL_GPL(alloc_vm_area); 3129 3034 3130 3035 void free_vm_area(struct vm_struct *area) 3131 3036 {
+8 -2
mm/zsmalloc.c
··· 1122 1122 */ 1123 1123 if (area->vm) 1124 1124 return 0; 1125 - area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); 1125 + area->vm = get_vm_area(PAGE_SIZE * 2, 0); 1126 1126 if (!area->vm) 1127 1127 return -ENOMEM; 1128 - return 0; 1128 + 1129 + /* 1130 + * Populate ptes in advance to avoid pte allocation with GFP_KERNEL 1131 + * in non-preemptible context of zs_map_object. 1132 + */ 1133 + return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr, 1134 + PAGE_SIZE * 2, NULL, NULL); 1129 1135 } 1130 1136 1131 1137 static inline void __zs_cpu_down(struct mapping_area *area)
+1 -1
tools/testing/selftests/vm/hmm-tests.c
··· 45 45 #define TWOMEG (1 << 21) 46 46 #define HMM_BUFFER_SIZE (1024 << 12) 47 47 #define HMM_PATH_MAX 64 48 - #define NTIMES 256 48 + #define NTIMES 10 49 49 50 50 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) 51 51