Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
"28 patches.

Subsystems affected by this series: mm (memblock, pagealloc, hugetlb,
highmem, kfence, oom-kill, madvise, kasan, userfaultfd, memcg, and
zram), core-kernel, kconfig, fork, binfmt, MAINTAINERS, kbuild, and
ia64"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (28 commits)
zram: fix broken page writeback
zram: fix return value on writeback_store
mm/memcg: set memcg when splitting page
mm/memcg: rename mem_cgroup_split_huge_fixup to split_page_memcg and add nr_pages argument
ia64: fix ptrace(PTRACE_SYSCALL_INFO_EXIT) sign
ia64: fix ia64_syscall_get_set_arguments() for break-based syscalls
mm/userfaultfd: fix memory corruption due to writeprotect
kasan: fix KASAN_STACK dependency for HW_TAGS
kasan, mm: fix crash with HW_TAGS and DEBUG_PAGEALLOC
mm/madvise: replace ptrace attach requirement for process_madvise
include/linux/sched/mm.h: use rcu_dereference in in_vfork()
kfence: fix reports if constant function prefixes exist
kfence, slab: fix cache_alloc_debugcheck_after() for bulk allocations
kfence: fix printk format for ptrdiff_t
linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP*
MAINTAINERS: exclude uapi directories in API/ABI section
binfmt_misc: fix possible deadlock in bm_register_write
mm/highmem.c: fix zero_user_segments() with start > end
hugetlb: do early cow when page pinned on src mm
mm: use is_cow_mapping() across tree where proper
...

+332 -214
+2 -2
MAINTAINERS
··· 261 261 L: linux-api@vger.kernel.org 262 262 F: include/linux/syscalls.h 263 263 F: kernel/sys_ni.c 264 - F: include/uapi/ 265 - F: arch/*/include/uapi/ 264 + X: include/uapi/ 265 + X: arch/*/include/uapi/ 266 266 267 267 ABIT UGURU 1,2 HARDWARE MONITOR DRIVER 268 268 M: Hans de Goede <hdegoede@redhat.com>
+1 -1
arch/ia64/include/asm/syscall.h
··· 32 32 static inline long syscall_get_error(struct task_struct *task, 33 33 struct pt_regs *regs) 34 34 { 35 - return regs->r10 == -1 ? regs->r8:0; 35 + return regs->r10 == -1 ? -regs->r8:0; 36 36 } 37 37 38 38 static inline long syscall_get_return_value(struct task_struct *task,
+18 -6
arch/ia64/kernel/ptrace.c
··· 2013 2013 { 2014 2014 struct syscall_get_set_args *args = data; 2015 2015 struct pt_regs *pt = args->regs; 2016 - unsigned long *krbs, cfm, ndirty; 2016 + unsigned long *krbs, cfm, ndirty, nlocals, nouts; 2017 2017 int i, count; 2018 2018 2019 2019 if (unw_unwind_to_user(info) < 0) 2020 2020 return; 2021 2021 2022 + /* 2023 + * We get here via a few paths: 2024 + * - break instruction: cfm is shared with caller. 2025 + * syscall args are in out= regs, locals are non-empty. 2026 + * - epc instruction: cfm is set by br.call 2027 + * locals don't exist. 2028 + * 2029 + * For both cases arguments are reachable in cfm.sof - cfm.sol. 2030 + * CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ] 2031 + */ 2022 2032 cfm = pt->cr_ifs; 2033 + nlocals = (cfm >> 7) & 0x7f; /* aka sol */ 2034 + nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */ 2023 2035 krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8; 2024 2036 ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19)); 2025 2037 2026 2038 count = 0; 2027 2039 if (in_syscall(pt)) 2028 - count = min_t(int, args->n, cfm & 0x7f); 2040 + count = min_t(int, args->n, nouts); 2029 2041 2042 + /* Iterate over outs. */ 2030 2043 for (i = 0; i < count; i++) { 2044 + int j = ndirty + nlocals + i + args->i; 2031 2045 if (args->rw) 2032 - *ia64_rse_skip_regs(krbs, ndirty + i + args->i) = 2033 - args->args[i]; 2046 + *ia64_rse_skip_regs(krbs, j) = args->args[i]; 2034 2047 else 2035 - args->args[i] = *ia64_rse_skip_regs(krbs, 2036 - ndirty + i + args->i); 2048 + args->args[i] = *ia64_rse_skip_regs(krbs, j); 2037 2049 } 2038 2050 2039 2051 if (!args->rw) {
+11 -6
drivers/block/zram/zram_drv.c
··· 627 627 struct bio_vec bio_vec; 628 628 struct page *page; 629 629 ssize_t ret = len; 630 - int mode; 630 + int mode, err; 631 631 unsigned long blk_idx = 0; 632 632 633 633 if (sysfs_streq(buf, "idle")) ··· 638 638 if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) 639 639 return -EINVAL; 640 640 641 - ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index); 642 - if (ret || index >= nr_pages) 641 + if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || 642 + index >= nr_pages) 643 643 return -EINVAL; 644 644 645 645 nr_pages = 1; ··· 663 663 goto release_init_lock; 664 664 } 665 665 666 - while (nr_pages--) { 666 + for (; nr_pages != 0; index++, nr_pages--) { 667 667 struct bio_vec bvec; 668 668 669 669 bvec.bv_page = page; ··· 728 728 * XXX: A single page IO would be inefficient for write 729 729 * but it would be not bad as starter. 730 730 */ 731 - ret = submit_bio_wait(&bio); 732 - if (ret) { 731 + err = submit_bio_wait(&bio); 732 + if (err) { 733 733 zram_slot_lock(zram, index); 734 734 zram_clear_flag(zram, index, ZRAM_UNDER_WB); 735 735 zram_clear_flag(zram, index, ZRAM_IDLE); 736 736 zram_slot_unlock(zram, index); 737 + /* 738 + * Return the last IO error unless every IO 739 + * succeeded. 740 + */ 741 + ret = err; 737 742 continue; 738 743 } 739 744
+1 -3
drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
··· 500 500 vm_fault_t ret; 501 501 pgoff_t fault_page_size; 502 502 bool write = vmf->flags & FAULT_FLAG_WRITE; 503 - bool is_cow_mapping = 504 - (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 505 503 506 504 switch (pe_size) { 507 505 case PE_SIZE_PMD: ··· 516 518 } 517 519 518 520 /* Always do write dirty-tracking and COW on PTE level. */ 519 - if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping)) 521 + if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags))) 520 522 return VM_FAULT_FALLBACK; 521 523 522 524 ret = ttm_bo_vm_reserve(bo, vmf);
+1 -1
drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c
··· 49 49 vma->vm_ops = &vmw_vm_ops; 50 50 51 51 /* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */ 52 - if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) != VM_MAYWRITE) 52 + if (!is_cow_mapping(vma->vm_flags)) 53 53 vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP; 54 54 55 55 return 0;
+14 -15
fs/binfmt_misc.c
··· 649 649 struct super_block *sb = file_inode(file)->i_sb; 650 650 struct dentry *root = sb->s_root, *dentry; 651 651 int err = 0; 652 + struct file *f = NULL; 652 653 653 654 e = create_entry(buffer, count); 654 655 655 656 if (IS_ERR(e)) 656 657 return PTR_ERR(e); 658 + 659 + if (e->flags & MISC_FMT_OPEN_FILE) { 660 + f = open_exec(e->interpreter); 661 + if (IS_ERR(f)) { 662 + pr_notice("register: failed to install interpreter file %s\n", 663 + e->interpreter); 664 + kfree(e); 665 + return PTR_ERR(f); 666 + } 667 + e->interp_file = f; 668 + } 657 669 658 670 inode_lock(d_inode(root)); 659 671 dentry = lookup_one_len(e->name, root, strlen(e->name)); ··· 690 678 goto out2; 691 679 } 692 680 693 - if (e->flags & MISC_FMT_OPEN_FILE) { 694 - struct file *f; 695 - 696 - f = open_exec(e->interpreter); 697 - if (IS_ERR(f)) { 698 - err = PTR_ERR(f); 699 - pr_notice("register: failed to install interpreter file %s\n", e->interpreter); 700 - simple_release_fs(&bm_mnt, &entry_count); 701 - iput(inode); 702 - inode = NULL; 703 - goto out2; 704 - } 705 - e->interp_file = f; 706 - } 707 - 708 681 e->dentry = dget(dentry); 709 682 inode->i_private = e; 710 683 inode->i_fop = &bm_entry_operations; ··· 706 709 inode_unlock(d_inode(root)); 707 710 708 711 if (err) { 712 + if (f) 713 + filp_close(f, NULL); 709 714 kfree(e); 710 715 return err; 711 716 }
-2
fs/proc/task_mmu.c
··· 1036 1036 1037 1037 #ifdef CONFIG_MEM_SOFT_DIRTY 1038 1038 1039 - #define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) 1040 - 1041 1039 static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) 1042 1040 { 1043 1041 struct page *page;
+6
include/linux/compiler-clang.h
··· 31 31 #define __no_sanitize_thread 32 32 #endif 33 33 34 + #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) 35 + #define __HAVE_BUILTIN_BSWAP32__ 36 + #define __HAVE_BUILTIN_BSWAP64__ 37 + #define __HAVE_BUILTIN_BSWAP16__ 38 + #endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */ 39 + 34 40 #if __has_feature(undefined_behavior_sanitizer) 35 41 /* GCC does not have __SANITIZE_UNDEFINED__ */ 36 42 #define __no_sanitize_undefined \
+2 -2
include/linux/memblock.h
··· 460 460 /* 461 461 * Set the allocation direction to bottom-up or top-down. 462 462 */ 463 - static inline void memblock_set_bottom_up(bool enable) 463 + static inline __init void memblock_set_bottom_up(bool enable) 464 464 { 465 465 memblock.bottom_up = enable; 466 466 } ··· 470 470 * if this is true, that said, memblock will allocate memory 471 471 * in bottom-up direction. 472 472 */ 473 - static inline bool memblock_bottom_up(void) 473 + static inline __init bool memblock_bottom_up(void) 474 474 { 475 475 return memblock.bottom_up; 476 476 }
+2 -4
include/linux/memcontrol.h
··· 1061 1061 rcu_read_unlock(); 1062 1062 } 1063 1063 1064 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1065 - void mem_cgroup_split_huge_fixup(struct page *head); 1066 - #endif 1064 + void split_page_memcg(struct page *head, unsigned int nr); 1067 1065 1068 1066 #else /* CONFIG_MEMCG */ 1069 1067 ··· 1398 1400 return 0; 1399 1401 } 1400 1402 1401 - static inline void mem_cgroup_split_huge_fixup(struct page *head) 1403 + static inline void split_page_memcg(struct page *head, unsigned int nr) 1402 1404 { 1403 1405 } 1404 1406
+21
include/linux/mm.h
··· 1300 1300 GUP_PIN_COUNTING_BIAS; 1301 1301 } 1302 1302 1303 + static inline bool is_cow_mapping(vm_flags_t flags) 1304 + { 1305 + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1306 + } 1307 + 1308 + /* 1309 + * This should most likely only be called during fork() to see whether we 1310 + * should break the cow immediately for a page on the src mm. 1311 + */ 1312 + static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, 1313 + struct page *page) 1314 + { 1315 + if (!is_cow_mapping(vma->vm_flags)) 1316 + return false; 1317 + 1318 + if (!atomic_read(&vma->vm_mm->has_pinned)) 1319 + return false; 1320 + 1321 + return page_maybe_dma_pinned(page); 1322 + } 1323 + 1303 1324 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 1304 1325 #define SECTION_IN_PAGE_FLAGS 1305 1326 #endif
+1
include/linux/mm_types.h
··· 23 23 #endif 24 24 #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) 25 25 26 + #define INIT_PASID 0 26 27 27 28 struct address_space; 28 29 struct mem_cgroup;
+2 -1
include/linux/sched/mm.h
··· 140 140 * another oom-unkillable task does this it should blame itself. 141 141 */ 142 142 rcu_read_lock(); 143 - ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm; 143 + ret = tsk->vfork_done && 144 + rcu_dereference(tsk->real_parent)->mm == tsk->mm; 144 145 rcu_read_unlock(); 145 146 146 147 return ret;
+6 -5
include/linux/stop_machine.h
··· 128 128 const struct cpumask *cpus); 129 129 #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ 130 130 131 - static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, 131 + static __always_inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, 132 132 const struct cpumask *cpus) 133 133 { 134 134 unsigned long flags; ··· 139 139 return ret; 140 140 } 141 141 142 - static inline int stop_machine(cpu_stop_fn_t fn, void *data, 143 - const struct cpumask *cpus) 142 + static __always_inline int 143 + stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) 144 144 { 145 145 return stop_machine_cpuslocked(fn, data, cpus); 146 146 } 147 147 148 - static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, 149 - const struct cpumask *cpus) 148 + static __always_inline int 149 + stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, 150 + const struct cpumask *cpus) 150 151 { 151 152 return stop_machine(fn, data, cpus); 152 153 }
+1 -2
init/Kconfig
··· 119 119 120 120 config COMPILE_TEST 121 121 bool "Compile also drivers which will not load" 122 - depends on !UML && !S390 123 - default n 122 + depends on HAS_IOMEM 124 123 help 125 124 Some drivers can be compiled on a different platform than they are 126 125 intended to be run on. Despite they cannot be loaded there (or even
+8
kernel/fork.c
··· 994 994 #endif 995 995 } 996 996 997 + static void mm_init_pasid(struct mm_struct *mm) 998 + { 999 + #ifdef CONFIG_IOMMU_SUPPORT 1000 + mm->pasid = INIT_PASID; 1001 + #endif 1002 + } 1003 + 997 1004 static void mm_init_uprobes_state(struct mm_struct *mm) 998 1005 { 999 1006 #ifdef CONFIG_UPROBES ··· 1031 1024 mm_init_cpumask(mm); 1032 1025 mm_init_aio(mm); 1033 1026 mm_init_owner(mm, p); 1027 + mm_init_pasid(mm); 1034 1028 RCU_INIT_POINTER(mm->exe_file, NULL); 1035 1029 mmu_notifier_subscriptions_init(mm); 1036 1030 init_tlb_flush_pending(mm);
+1
lib/Kconfig.kasan
··· 156 156 157 157 config KASAN_STACK 158 158 int 159 + depends on KASAN_GENERIC || KASAN_SW_TAGS 159 160 default 1 if KASAN_STACK_ENABLE || CC_IS_GCC 160 161 default 0 161 162
+12 -5
mm/highmem.c
··· 368 368 369 369 BUG_ON(end1 > page_size(page) || end2 > page_size(page)); 370 370 371 + if (start1 >= end1) 372 + start1 = end1 = 0; 373 + if (start2 >= end2) 374 + start2 = end2 = 0; 375 + 371 376 for (i = 0; i < compound_nr(page); i++) { 372 377 void *kaddr = NULL; 373 - 374 - if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) 375 - kaddr = kmap_atomic(page + i); 376 378 377 379 if (start1 >= PAGE_SIZE) { 378 380 start1 -= PAGE_SIZE; ··· 382 380 } else { 383 381 unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); 384 382 385 - if (end1 > start1) 383 + if (end1 > start1) { 384 + kaddr = kmap_atomic(page + i); 386 385 memset(kaddr + start1, 0, this_end - start1); 386 + } 387 387 end1 -= this_end; 388 388 start1 = 0; 389 389 } ··· 396 392 } else { 397 393 unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); 398 394 399 - if (end2 > start2) 395 + if (end2 > start2) { 396 + if (!kaddr) 397 + kaddr = kmap_atomic(page + i); 400 398 memset(kaddr + start2, 0, this_end - start2); 399 + } 401 400 end2 -= this_end; 402 401 start2 = 0; 403 402 }
+3 -7
mm/huge_memory.c
··· 1100 1100 * best effort that the pinned pages won't be replaced by another 1101 1101 * random page during the coming copy-on-write. 1102 1102 */ 1103 - if (unlikely(is_cow_mapping(vma->vm_flags) && 1104 - atomic_read(&src_mm->has_pinned) && 1105 - page_maybe_dma_pinned(src_page))) { 1103 + if (unlikely(page_needs_cow_for_dma(vma, src_page))) { 1106 1104 pte_free(dst_mm, pgtable); 1107 1105 spin_unlock(src_ptl); 1108 1106 spin_unlock(dst_ptl); ··· 1212 1214 } 1213 1215 1214 1216 /* Please refer to comments in copy_huge_pmd() */ 1215 - if (unlikely(is_cow_mapping(vma->vm_flags) && 1216 - atomic_read(&src_mm->has_pinned) && 1217 - page_maybe_dma_pinned(pud_page(pud)))) { 1217 + if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) { 1218 1218 spin_unlock(src_ptl); 1219 1219 spin_unlock(dst_ptl); 1220 1220 __split_huge_pud(vma, src_pud, addr); ··· 2467 2471 int i; 2468 2472 2469 2473 /* complete memcg works before add pages to LRU */ 2470 - mem_cgroup_split_huge_fixup(head); 2474 + split_page_memcg(head, nr); 2471 2475 2472 2476 if (PageAnon(head) && PageSwapCache(head)) { 2473 2477 swp_entry_t entry = { .val = page_private(head) };
+91 -32
mm/hugetlb.c
··· 331 331 } 332 332 } 333 333 334 + static inline long 335 + hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from, 336 + long to, struct hstate *h, struct hugetlb_cgroup *cg, 337 + long *regions_needed) 338 + { 339 + struct file_region *nrg; 340 + 341 + if (!regions_needed) { 342 + nrg = get_file_region_entry_from_cache(map, from, to); 343 + record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); 344 + list_add(&nrg->link, rg->link.prev); 345 + coalesce_file_region(map, nrg); 346 + } else 347 + *regions_needed += 1; 348 + 349 + return to - from; 350 + } 351 + 334 352 /* 335 353 * Must be called with resv->lock held. 336 354 * ··· 364 346 long add = 0; 365 347 struct list_head *head = &resv->regions; 366 348 long last_accounted_offset = f; 367 - struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; 349 + struct file_region *rg = NULL, *trg = NULL; 368 350 369 351 if (regions_needed) 370 352 *regions_needed = 0; ··· 387 369 /* When we find a region that starts beyond our range, we've 388 370 * finished. 389 371 */ 390 - if (rg->from > t) 372 + if (rg->from >= t) 391 373 break; 392 374 393 375 /* Add an entry for last_accounted_offset -> rg->from, and 394 376 * update last_accounted_offset. 395 377 */ 396 - if (rg->from > last_accounted_offset) { 397 - add += rg->from - last_accounted_offset; 398 - if (!regions_needed) { 399 - nrg = get_file_region_entry_from_cache( 400 - resv, last_accounted_offset, rg->from); 401 - record_hugetlb_cgroup_uncharge_info(h_cg, h, 402 - resv, nrg); 403 - list_add(&nrg->link, rg->link.prev); 404 - coalesce_file_region(resv, nrg); 405 - } else 406 - *regions_needed += 1; 407 - } 378 + if (rg->from > last_accounted_offset) 379 + add += hugetlb_resv_map_add(resv, rg, 380 + last_accounted_offset, 381 + rg->from, h, h_cg, 382 + regions_needed); 408 383 409 384 last_accounted_offset = rg->to; 410 385 } ··· 405 394 /* Handle the case where our range extends beyond 406 395 * last_accounted_offset. 
407 396 */ 408 - if (last_accounted_offset < t) { 409 - add += t - last_accounted_offset; 410 - if (!regions_needed) { 411 - nrg = get_file_region_entry_from_cache( 412 - resv, last_accounted_offset, t); 413 - record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg); 414 - list_add(&nrg->link, rg->link.prev); 415 - coalesce_file_region(resv, nrg); 416 - } else 417 - *regions_needed += 1; 418 - } 397 + if (last_accounted_offset < t) 398 + add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, 399 + t, h, h_cg, regions_needed); 419 400 420 401 VM_BUG_ON(add < 0); 421 402 return add; ··· 3728 3725 return false; 3729 3726 } 3730 3727 3728 + static void 3729 + hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, 3730 + struct page *new_page) 3731 + { 3732 + __SetPageUptodate(new_page); 3733 + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); 3734 + hugepage_add_new_anon_rmap(new_page, vma, addr); 3735 + hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); 3736 + ClearHPageRestoreReserve(new_page); 3737 + SetHPageMigratable(new_page); 3738 + } 3739 + 3731 3740 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 3732 3741 struct vm_area_struct *vma) 3733 3742 { 3734 3743 pte_t *src_pte, *dst_pte, entry, dst_entry; 3735 3744 struct page *ptepage; 3736 3745 unsigned long addr; 3737 - int cow; 3746 + bool cow = is_cow_mapping(vma->vm_flags); 3738 3747 struct hstate *h = hstate_vma(vma); 3739 3748 unsigned long sz = huge_page_size(h); 3749 + unsigned long npages = pages_per_huge_page(h); 3740 3750 struct address_space *mapping = vma->vm_file->f_mapping; 3741 3751 struct mmu_notifier_range range; 3742 3752 int ret = 0; 3743 - 3744 - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 3745 3753 3746 3754 if (cow) { 3747 3755 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, ··· 3798 3784 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 3799 3785 entry = 
huge_ptep_get(src_pte); 3800 3786 dst_entry = huge_ptep_get(dst_pte); 3787 + again: 3801 3788 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { 3802 3789 /* 3803 3790 * Skip if src entry none. Also, skip in the ··· 3822 3807 } 3823 3808 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); 3824 3809 } else { 3810 + entry = huge_ptep_get(src_pte); 3811 + ptepage = pte_page(entry); 3812 + get_page(ptepage); 3813 + 3814 + /* 3815 + * This is a rare case where we see pinned hugetlb 3816 + * pages while they're prone to COW. We need to do the 3817 + * COW earlier during fork. 3818 + * 3819 + * When pre-allocating the page or copying data, we 3820 + * need to be without the pgtable locks since we could 3821 + * sleep during the process. 3822 + */ 3823 + if (unlikely(page_needs_cow_for_dma(vma, ptepage))) { 3824 + pte_t src_pte_old = entry; 3825 + struct page *new; 3826 + 3827 + spin_unlock(src_ptl); 3828 + spin_unlock(dst_ptl); 3829 + /* Do not use reserve as it's private owned */ 3830 + new = alloc_huge_page(vma, addr, 1); 3831 + if (IS_ERR(new)) { 3832 + put_page(ptepage); 3833 + ret = PTR_ERR(new); 3834 + break; 3835 + } 3836 + copy_user_huge_page(new, ptepage, addr, vma, 3837 + npages); 3838 + put_page(ptepage); 3839 + 3840 + /* Install the new huge page if src pte stable */ 3841 + dst_ptl = huge_pte_lock(h, dst, dst_pte); 3842 + src_ptl = huge_pte_lockptr(h, src, src_pte); 3843 + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 3844 + entry = huge_ptep_get(src_pte); 3845 + if (!pte_same(src_pte_old, entry)) { 3846 + put_page(new); 3847 + /* dst_entry won't change as in child */ 3848 + goto again; 3849 + } 3850 + hugetlb_install_page(vma, dst_pte, addr, new); 3851 + spin_unlock(src_ptl); 3852 + spin_unlock(dst_ptl); 3853 + continue; 3854 + } 3855 + 3825 3856 if (cow) { 3826 3857 /* 3827 3858 * No need to notify as we are downgrading page ··· 3878 3817 */ 3879 3818 huge_ptep_set_wrprotect(src, addr, src_pte); 3880 3819 } 3881 - entry = huge_ptep_get(src_pte); 
3882 - ptepage = pte_page(entry); 3883 - get_page(ptepage); 3820 + 3884 3821 page_dup_rmap(ptepage, true); 3885 3822 set_huge_pte_at(dst, addr, dst_pte, entry); 3886 - hugetlb_count_add(pages_per_huge_page(h), dst); 3823 + hugetlb_count_add(npages, dst); 3887 3824 } 3888 3825 spin_unlock(src_ptl); 3889 3826 spin_unlock(dst_ptl);
-5
mm/internal.h
··· 296 296 */ 297 297 #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) 298 298 299 - static inline bool is_cow_mapping(vm_flags_t flags) 300 - { 301 - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 302 - } 303 - 304 299 /* 305 300 * These three helpers classifies VMAs for virtual memory accounting. 306 301 */
+18 -12
mm/kfence/report.c
··· 20 20 21 21 #include "kfence.h" 22 22 23 + /* May be overridden by <asm/kfence.h>. */ 24 + #ifndef ARCH_FUNC_PREFIX 25 + #define ARCH_FUNC_PREFIX "" 26 + #endif 27 + 23 28 extern bool no_hash_pointers; 24 29 25 30 /* Helper function to either print to a seq_file or to console. */ ··· 72 67 for (skipnr = 0; skipnr < num_entries; skipnr++) { 73 68 int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); 74 69 75 - if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") || 76 - !strncmp(buf, "__slab_free", len)) { 70 + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || 71 + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || 72 + !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { 77 73 /* 78 74 * In case of tail calls from any of the below 79 75 * to any of the above. ··· 83 77 } 84 78 85 79 /* Also the *_bulk() variants by only checking prefixes. */ 86 - if (str_has_prefix(buf, "kfree") || 87 - str_has_prefix(buf, "kmem_cache_free") || 88 - str_has_prefix(buf, "__kmalloc") || 89 - str_has_prefix(buf, "kmem_cache_alloc")) 80 + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || 81 + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || 82 + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || 83 + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) 90 84 goto found; 91 85 } 92 86 if (fallback < num_entries) ··· 122 116 lockdep_assert_held(&meta->lock); 123 117 124 118 if (meta->state == KFENCE_OBJECT_UNUSED) { 125 - seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata); 119 + seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata); 126 120 return; 127 121 } 128 122 129 123 seq_con_printf(seq, 130 - "kfence-#%zd [0x%p-0x%p" 124 + "kfence-#%td [0x%p-0x%p" 131 125 ", size=%d, cache=%s] allocated by task %d:\n", 132 126 meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size, 133 127 (cache && cache->name) ? 
cache->name : "<destroyed>", meta->alloc_track.pid); ··· 210 204 211 205 pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), 212 206 (void *)stack_entries[skipnr]); 213 - pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n", 207 + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n", 214 208 get_access_type(is_write), (void *)address, 215 209 left_of_object ? meta->addr - address : address - meta->addr, 216 210 left_of_object ? "left" : "right", object_index); ··· 219 213 case KFENCE_ERROR_UAF: 220 214 pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), 221 215 (void *)stack_entries[skipnr]); 222 - pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n", 216 + pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n", 223 217 get_access_type(is_write), (void *)address, object_index); 224 218 break; 225 219 case KFENCE_ERROR_CORRUPTION: 226 220 pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); 227 221 pr_err("Corrupted memory at 0x%p ", (void *)address); 228 222 print_diff_canary(address, 16, meta); 229 - pr_cont(" (in kfence-#%zd):\n", object_index); 223 + pr_cont(" (in kfence-#%td):\n", object_index); 230 224 break; 231 225 case KFENCE_ERROR_INVALID: 232 226 pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), ··· 236 230 break; 237 231 case KFENCE_ERROR_INVALID_FREE: 238 232 pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); 239 - pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address, 233 + pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address, 240 234 object_index); 241 235 break; 242 236 }
+12 -1
mm/madvise.c
··· 1198 1198 goto release_task; 1199 1199 } 1200 1200 1201 - mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); 1201 + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 1202 + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 1202 1203 if (IS_ERR_OR_NULL(mm)) { 1203 1204 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 1204 1205 goto release_task; 1206 + } 1207 + 1208 + /* 1209 + * Require CAP_SYS_NICE for influencing process performance. Note that 1210 + * only non-destructive hints are currently supported. 1211 + */ 1212 + if (!capable(CAP_SYS_NICE)) { 1213 + ret = -EPERM; 1214 + goto release_mm; 1205 1215 } 1206 1216 1207 1217 total_len = iov_iter_count(&iter); ··· 1228 1218 if (ret == 0) 1229 1219 ret = total_len - iov_iter_count(&iter); 1230 1220 1221 + release_mm: 1231 1222 mmput(mm); 1232 1223 release_task: 1233 1224 put_task_struct(task);
+6 -9
mm/memcontrol.c
··· 3287 3287 3288 3288 #endif /* CONFIG_MEMCG_KMEM */ 3289 3289 3290 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3291 3290 /* 3292 - * Because page_memcg(head) is not set on compound tails, set it now. 3291 + * Because page_memcg(head) is not set on tails, set it now. 3293 3292 */ 3294 - void mem_cgroup_split_huge_fixup(struct page *head) 3293 + void split_page_memcg(struct page *head, unsigned int nr) 3295 3294 { 3296 3295 struct mem_cgroup *memcg = page_memcg(head); 3297 3296 int i; 3298 3297 3299 - if (mem_cgroup_disabled()) 3298 + if (mem_cgroup_disabled() || !memcg) 3300 3299 return; 3301 3300 3302 - for (i = 1; i < HPAGE_PMD_NR; i++) { 3303 - css_get(&memcg->css); 3304 - head[i].memcg_data = (unsigned long)memcg; 3305 - } 3301 + for (i = 1; i < nr; i++) 3302 + head[i].memcg_data = head->memcg_data; 3303 + css_get_many(&memcg->css, nr - 1); 3306 3304 } 3307 - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3308 3305 3309 3306 #ifdef CONFIG_MEMCG_SWAP 3310 3307 /**
+9 -7
mm/memory.c
··· 809 809 pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, 810 810 struct page **prealloc, pte_t pte, struct page *page) 811 811 { 812 - struct mm_struct *src_mm = src_vma->vm_mm; 813 812 struct page *new_page; 814 - 815 - if (!is_cow_mapping(src_vma->vm_flags)) 816 - return 1; 817 813 818 814 /* 819 815 * What we want to do is to check whether this page may ··· 824 828 * the page count. That might give false positives for 825 829 * for pinning, but it will work correctly. 826 830 */ 827 - if (likely(!atomic_read(&src_mm->has_pinned))) 828 - return 1; 829 - if (likely(!page_maybe_dma_pinned(page))) 831 + if (likely(!page_needs_cow_for_dma(src_vma, page))) 830 832 return 1; 831 833 832 834 new_page = *prealloc; ··· 3096 3102 pte_unmap_unlock(vmf->pte, vmf->ptl); 3097 3103 return handle_userfault(vmf, VM_UFFD_WP); 3098 3104 } 3105 + 3106 + /* 3107 + * Userfaultfd write-protect can defer flushes. Ensure the TLB 3108 + * is flushed in this case before copying. 3109 + */ 3110 + if (unlikely(userfaultfd_wp(vmf->vma) && 3111 + mm_tlb_flush_pending(vmf->vma->vm_mm))) 3112 + flush_tlb_page(vmf->vma, vmf->address); 3099 3113 3100 3114 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); 3101 3115 if (!vmf->page) {
+82 -85
mm/page_alloc.c
··· 1282 1282 kernel_poison_pages(page, 1 << order); 1283 1283 1284 1284 /* 1285 + * With hardware tag-based KASAN, memory tags must be set before the 1286 + * page becomes unavailable via debug_pagealloc or arch_free_page. 1287 + */ 1288 + kasan_free_nondeferred_pages(page, order); 1289 + 1290 + /* 1285 1291 * arch_free_page() can make the page's contents inaccessible. s390 1286 1292 * does this. So nothing which can access the page's contents should 1287 1293 * happen after this. ··· 1295 1289 arch_free_page(page, order); 1296 1290 1297 1291 debug_pagealloc_unmap_pages(page, 1 << order); 1298 - 1299 - kasan_free_nondeferred_pages(page, order); 1300 1292 1301 1293 return true; 1302 1294 } ··· 3314 3310 for (i = 1; i < (1 << order); i++) 3315 3311 set_page_refcounted(page + i); 3316 3312 split_page_owner(page, 1 << order); 3313 + split_page_memcg(page, 1 << order); 3317 3314 } 3318 3315 EXPORT_SYMBOL_GPL(split_page); 3319 3316 ··· 6264 6259 } 6265 6260 } 6266 6261 6262 + #if !defined(CONFIG_FLAT_NODE_MEM_MAP) 6263 + /* 6264 + * Only struct pages that correspond to ranges defined by memblock.memory 6265 + * are zeroed and initialized by going through __init_single_page() during 6266 + * memmap_init_zone(). 6267 + * 6268 + * But, there could be struct pages that correspond to holes in 6269 + * memblock.memory. 
This can happen because of the following reasons: 6270 + * - physical memory bank size is not necessarily the exact multiple of the 6271 + * arbitrary section size 6272 + * - early reserved memory may not be listed in memblock.memory 6273 + * - memory layouts defined with memmap= kernel parameter may not align 6274 + * nicely with memmap sections 6275 + * 6276 + * Explicitly initialize those struct pages so that: 6277 + * - PG_Reserved is set 6278 + * - zone and node links point to zone and node that span the page if the 6279 + * hole is in the middle of a zone 6280 + * - zone and node links point to adjacent zone/node if the hole falls on 6281 + * the zone boundary; the pages in such holes will be prepended to the 6282 + * zone/node above the hole except for the trailing pages in the last 6283 + * section that will be appended to the zone/node below. 6284 + */ 6285 + static u64 __meminit init_unavailable_range(unsigned long spfn, 6286 + unsigned long epfn, 6287 + int zone, int node) 6288 + { 6289 + unsigned long pfn; 6290 + u64 pgcnt = 0; 6291 + 6292 + for (pfn = spfn; pfn < epfn; pfn++) { 6293 + if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { 6294 + pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) 6295 + + pageblock_nr_pages - 1; 6296 + continue; 6297 + } 6298 + __init_single_page(pfn_to_page(pfn), pfn, zone, node); 6299 + __SetPageReserved(pfn_to_page(pfn)); 6300 + pgcnt++; 6301 + } 6302 + 6303 + return pgcnt; 6304 + } 6305 + #else 6306 + static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, 6307 + int zone, int node) 6308 + { 6309 + return 0; 6310 + } 6311 + #endif 6312 + 6267 6313 void __meminit __weak memmap_init_zone(struct zone *zone) 6268 6314 { 6269 6315 unsigned long zone_start_pfn = zone->zone_start_pfn; 6270 6316 unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; 6271 6317 int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); 6318 + static unsigned long hole_pfn; 6272 6319 unsigned long start_pfn, end_pfn; 
6320 + u64 pgcnt = 0; 6273 6321 6274 6322 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 6275 6323 start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); ··· 6332 6274 memmap_init_range(end_pfn - start_pfn, nid, 6333 6275 zone_id, start_pfn, zone_end_pfn, 6334 6276 MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); 6277 + 6278 + if (hole_pfn < start_pfn) 6279 + pgcnt += init_unavailable_range(hole_pfn, start_pfn, 6280 + zone_id, nid); 6281 + hole_pfn = end_pfn; 6335 6282 } 6283 + 6284 + #ifdef CONFIG_SPARSEMEM 6285 + /* 6286 + * Initialize the hole in the range [zone_end_pfn, section_end]. 6287 + * If zone boundary falls in the middle of a section, this hole 6288 + * will be re-initialized during the call to this function for the 6289 + * higher zone. 6290 + */ 6291 + end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); 6292 + if (hole_pfn < end_pfn) 6293 + pgcnt += init_unavailable_range(hole_pfn, end_pfn, 6294 + zone_id, nid); 6295 + #endif 6296 + 6297 + if (pgcnt) 6298 + pr_info(" %s zone: %llu pages in unavailable ranges\n", 6299 + zone->name, pgcnt); 6336 6300 } 6337 6301 6338 6302 static int zone_batchsize(struct zone *zone) ··· 7151 7071 free_area_init_node(nid); 7152 7072 } 7153 7073 7154 - #if !defined(CONFIG_FLAT_NODE_MEM_MAP) 7155 - /* 7156 - * Initialize all valid struct pages in the range [spfn, epfn) and mark them 7157 - * PageReserved(). Return the number of struct pages that were initialized. 7158 - */ 7159 - static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn) 7160 - { 7161 - unsigned long pfn; 7162 - u64 pgcnt = 0; 7163 - 7164 - for (pfn = spfn; pfn < epfn; pfn++) { 7165 - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { 7166 - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) 7167 - + pageblock_nr_pages - 1; 7168 - continue; 7169 - } 7170 - /* 7171 - * Use a fake node/zone (0) for now. 
Some of these pages 7172 - * (in memblock.reserved but not in memblock.memory) will 7173 - * get re-initialized via reserve_bootmem_region() later. 7174 - */ 7175 - __init_single_page(pfn_to_page(pfn), pfn, 0, 0); 7176 - __SetPageReserved(pfn_to_page(pfn)); 7177 - pgcnt++; 7178 - } 7179 - 7180 - return pgcnt; 7181 - } 7182 - 7183 - /* 7184 - * Only struct pages that are backed by physical memory are zeroed and 7185 - * initialized by going through __init_single_page(). But, there are some 7186 - * struct pages which are reserved in memblock allocator and their fields 7187 - * may be accessed (for example page_to_pfn() on some configuration accesses 7188 - * flags). We must explicitly initialize those struct pages. 7189 - * 7190 - * This function also addresses a similar issue where struct pages are left 7191 - * uninitialized because the physical address range is not covered by 7192 - * memblock.memory or memblock.reserved. That could happen when memblock 7193 - * layout is manually configured via memmap=, or when the highest physical 7194 - * address (max_pfn) does not end on a section boundary. 7195 - */ 7196 - static void __init init_unavailable_mem(void) 7197 - { 7198 - phys_addr_t start, end; 7199 - u64 i, pgcnt; 7200 - phys_addr_t next = 0; 7201 - 7202 - /* 7203 - * Loop through unavailable ranges not covered by memblock.memory. 7204 - */ 7205 - pgcnt = 0; 7206 - for_each_mem_range(i, &start, &end) { 7207 - if (next < start) 7208 - pgcnt += init_unavailable_range(PFN_DOWN(next), 7209 - PFN_UP(start)); 7210 - next = end; 7211 - } 7212 - 7213 - /* 7214 - * Early sections always have a fully populated memmap for the whole 7215 - * section - see pfn_valid(). If the last section has holes at the 7216 - * end and that section is marked "online", the memmap will be 7217 - * considered initialized. Make sure that memmap has a well defined 7218 - * state. 
7219 - */ 7220 - pgcnt += init_unavailable_range(PFN_DOWN(next), 7221 - round_up(max_pfn, PAGES_PER_SECTION)); 7222 - 7223 - /* 7224 - * Struct pages that do not have backing memory. This could be because 7225 - * firmware is using some of this memory, or for some other reasons. 7226 - */ 7227 - if (pgcnt) 7228 - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); 7229 - } 7230 - #else 7231 - static inline void __init init_unavailable_mem(void) 7232 - { 7233 - } 7234 - #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ 7235 - 7236 7074 #if MAX_NUMNODES > 1 7237 7075 /* 7238 7076 * Figure out the number of possible node ids. ··· 7574 7576 /* Initialise every node */ 7575 7577 mminit_verify_pageflags_layout(); 7576 7578 setup_nr_node_ids(); 7577 - init_unavailable_mem(); 7578 7579 for_each_online_node(nid) { 7579 7580 pg_data_t *pgdat = NODE_DATA(nid); 7580 7581 free_area_init_node(nid);
+1 -1
mm/slab.c
··· 2992 2992 gfp_t flags, void *objp, unsigned long caller) 2993 2993 { 2994 2994 WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); 2995 - if (!objp) 2995 + if (!objp || is_kfence_address(objp)) 2996 2996 return objp; 2997 2997 if (cachep->flags & SLAB_POISON) { 2998 2998 check_poison_obj(cachep, objp);