kernel/events/uprobes: uprobe_write_opcode() rewrite

uprobe_write_opcode() does some pretty low-level things that it really
shouldn't be doing: for example, manually breaking COW by allocating
anonymous folios and replacing mapped pages.

Further, it does seem to do some shaky things: for example, writing to
possibly COW-shared anonymous pages or zapping anonymous pages that might
be pinned. We're also not taking care of uffd, uffd-wp, softdirty ...
although these are rather corner cases here. Let's just get it right like
ordinary ptrace writes would.

Let's rewrite the code, leaving COW-breaking to core-MM, triggered by
FOLL_FORCE|FOLL_WRITE (note that the code was already using FOLL_FORCE).

We'll use GUP to look up/fault in the page and break COW if required. Then,
we'll walk the page tables using a folio_walk to perform our page
modification atomically by temporarily unmapping the PTE and flushing the
TLB.

Likely, we could avoid the temporary unmap in case we can just atomically
write the instruction, but that will be a separate project.

Unfortunately, we still have to implement the zapping logic manually,
because we only want to zap in specific circumstances (e.g., page content
identical).

Note that we can now handle large folios (compound pages) and the shared
zeropage just fine, so drop these checks.

Link: https://lkml.kernel.org/r/20250321113713.204682-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <olsajiri@gmail.com>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Namhyung kim <namhyung@kernel.org>
Cc: Russel King <linux@armlinux.org.uk>
Cc: tongtiangen <tongtiangen@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

David Hildenbrand and committed by

Andrew Morton 1 year ago 6e3092d7 8a557742

+160 -156

1 changed file

expand all

kernel

events

uprobes.c

+160 -156

kernel/events/uprobes.c

··· 29 29 #include <linux/workqueue.h> 30 30 #include <linux/srcu.h> 31 31 #include <linux/oom.h> /* check_stable_address_space */ 32 + #include <linux/pagewalk.h> 32 33 33 34 #include <linux/uprobes.h> 34 35 ··· 150 149 static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) 151 150 { 152 151 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); 153 - } 154 - 155 - /** 156 - * __replace_page - replace page in vma by new page. 157 - * based on replace_page in mm/ksm.c 158 - * 159 - * @vma: vma that holds the pte pointing to page 160 - * @addr: address the old @page is mapped at 161 - * @old_page: the page we are replacing by new_page 162 - * @new_page: the modified page we replace page by 163 - * 164 - * If @new_page is NULL, only unmap @old_page. 165 - * 166 - * Returns 0 on success, negative error code otherwise. 167 - */ 168 - static int __replace_page(struct vm_area_struct *vma, unsigned long addr, 169 - struct page *old_page, struct page *new_page) 170 - { 171 - struct folio *old_folio = page_folio(old_page); 172 - struct folio *new_folio; 173 - struct mm_struct *mm = vma->vm_mm; 174 - DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); 175 - int err; 176 - struct mmu_notifier_range range; 177 - pte_t pte; 178 - 179 - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, 180 - addr + PAGE_SIZE); 181 - 182 - if (new_page) { 183 - new_folio = page_folio(new_page); 184 - err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); 185 - if (err) 186 - return err; 187 - } 188 - 189 - /* For folio_free_swap() below */ 190 - folio_lock(old_folio); 191 - 192 - mmu_notifier_invalidate_range_start(&range); 193 - err = -EAGAIN; 194 - if (!page_vma_mapped_walk(&pvmw)) 195 - goto unlock; 196 - VM_BUG_ON_PAGE(addr != pvmw.address, old_page); 197 - pte = ptep_get(pvmw.pte); 198 - 199 - /* 200 - * Handle PFN swap PTES, such as device-exclusive ones, that actually 201 - * map pages: simply trigger GUP again to fix it up. 
202 - */ 203 - if (unlikely(!pte_present(pte))) { 204 - page_vma_mapped_walk_done(&pvmw); 205 - goto unlock; 206 - } 207 - 208 - if (new_page) { 209 - folio_get(new_folio); 210 - folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); 211 - folio_add_lru_vma(new_folio, vma); 212 - } else 213 - /* no new page, just dec_mm_counter for old_page */ 214 - dec_mm_counter(mm, MM_ANONPAGES); 215 - 216 - if (!folio_test_anon(old_folio)) { 217 - dec_mm_counter(mm, mm_counter_file(old_folio)); 218 - inc_mm_counter(mm, MM_ANONPAGES); 219 - } 220 - 221 - flush_cache_page(vma, addr, pte_pfn(pte)); 222 - ptep_clear_flush(vma, addr, pvmw.pte); 223 - if (new_page) 224 - set_pte_at(mm, addr, pvmw.pte, 225 - mk_pte(new_page, vma->vm_page_prot)); 226 - 227 - folio_remove_rmap_pte(old_folio, old_page, vma); 228 - if (!folio_mapped(old_folio)) 229 - folio_free_swap(old_folio); 230 - page_vma_mapped_walk_done(&pvmw); 231 - folio_put(old_folio); 232 - 233 - err = 0; 234 - unlock: 235 - mmu_notifier_invalidate_range_end(&range); 236 - folio_unlock(old_folio); 237 - return err; 238 152 } 239 153 240 154 /** ··· 379 463 return ret; 380 464 } 381 465 466 + static bool orig_page_is_identical(struct vm_area_struct *vma, 467 + unsigned long vaddr, struct page *page, bool *pmd_mappable) 468 + { 469 + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; 470 + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, 471 + index); 472 + struct page *orig_page; 473 + bool identical; 474 + 475 + if (IS_ERR(orig_folio)) 476 + return false; 477 + orig_page = folio_file_page(orig_folio, index); 478 + 479 + *pmd_mappable = folio_test_pmd_mappable(orig_folio); 480 + identical = folio_test_uptodate(orig_folio) && 481 + pages_identical(page, orig_page); 482 + folio_put(orig_folio); 483 + return identical; 484 + } 485 + 486 + static int __uprobe_write_opcode(struct vm_area_struct *vma, 487 + struct folio_walk *fw, struct folio *folio, 488 + unsigned long opcode_vaddr, 
uprobe_opcode_t opcode) 489 + { 490 + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; 491 + const bool is_register = !!is_swbp_insn(&opcode); 492 + bool pmd_mappable; 493 + 494 + /* For now, we'll only handle PTE-mapped folios. */ 495 + if (fw->level != FW_LEVEL_PTE) 496 + return -EFAULT; 497 + 498 + /* 499 + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, 500 + * but the VMA might not be writable. 501 + */ 502 + if (!pte_write(fw->pte)) { 503 + if (!PageAnonExclusive(fw->page)) 504 + return -EFAULT; 505 + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) 506 + return -EFAULT; 507 + /* SOFTDIRTY is handled via pte_mkdirty() below. */ 508 + } 509 + 510 + /* 511 + * We'll temporarily unmap the page and flush the TLB, such that we can 512 + * modify the page atomically. 513 + */ 514 + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); 515 + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); 516 + copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 517 + 518 + /* 519 + * When unregistering, we may only zap a PTE if uffd is disabled and 520 + * there are no unexpected folio references ... 521 + */ 522 + if (is_register || userfaultfd_missing(vma) || 523 + (folio_ref_count(folio) != folio_mapcount(folio) + 1 + 524 + folio_test_swapcache(folio) * folio_nr_pages(folio))) 525 + goto remap; 526 + 527 + /* 528 + * ... and the mapped page is identical to the original page that 529 + * would get faulted in on next access. 
530 + */ 531 + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) 532 + goto remap; 533 + 534 + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); 535 + folio_remove_rmap_pte(folio, fw->page, vma); 536 + if (!folio_mapped(folio) && folio_test_swapcache(folio) && 537 + folio_trylock(folio)) { 538 + folio_free_swap(folio); 539 + folio_unlock(folio); 540 + } 541 + folio_put(folio); 542 + 543 + return pmd_mappable; 544 + remap: 545 + /* 546 + * Make sure that our copy_to_page() changes become visible before the 547 + * set_pte_at() write. 548 + */ 549 + smp_wmb(); 550 + /* We modified the page. Make sure to mark the PTE dirty. */ 551 + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); 552 + return 0; 553 + } 554 + 382 555 /* 383 556 * NOTE: 384 557 * Expect the breakpoint instruction to be the smallest size instruction for ··· 480 475 * uprobe_write_opcode - write the opcode at a given virtual address. 481 476 * @auprobe: arch specific probepoint information. 482 477 * @vma: the probed virtual memory area. 483 - * @vaddr: the virtual address to store the opcode. 484 - * @opcode: opcode to be written at @vaddr. 478 + * @opcode_vaddr: the virtual address to store the opcode. 479 + * @opcode: opcode to be written at @opcode_vaddr. 485 480 * 486 481 * Called with mm->mmap_lock held for read or write. 487 482 * Return 0 (success) or a negative errno. 
488 483 */ 489 484 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, 490 - unsigned long vaddr, uprobe_opcode_t opcode) 485 + const unsigned long opcode_vaddr, uprobe_opcode_t opcode) 491 486 { 487 + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; 492 488 struct mm_struct *mm = vma->vm_mm; 493 489 struct uprobe *uprobe; 494 - struct page *old_page, *new_page; 495 490 int ret, is_register, ref_ctr_updated = 0; 496 - bool orig_page_huge = false; 497 491 unsigned int gup_flags = FOLL_FORCE; 492 + struct mmu_notifier_range range; 493 + struct folio_walk fw; 494 + struct folio *folio; 495 + struct page *page; 498 496 499 497 is_register = is_swbp_insn(&opcode); 500 498 uprobe = container_of(auprobe, struct uprobe, arch); 501 499 502 - retry: 500 + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) 501 + return -EINVAL; 502 + 503 + /* 504 + * When registering, we have to break COW to get an exclusive anonymous 505 + * page that we can safely modify. Use FOLL_WRITE to trigger a write 506 + * fault if required. When unregistering, we might be lucky and the 507 + * anon page is already gone. So defer write faults until really 508 + * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() 509 + * cannot deal with PMDs yet. 
510 + */ 503 511 if (is_register) 504 - gup_flags |= FOLL_SPLIT_PMD; 505 - /* Read the page with vaddr into memory */ 506 - ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, NULL); 507 - if (ret != 1) 508 - return ret; 512 + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; 509 513 510 - ret = verify_opcode(old_page, vaddr, &opcode); 514 + retry: 515 + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); 511 516 if (ret <= 0) 512 - goto put_old; 517 + goto out; 518 + folio = page_folio(page); 513 519 514 - if (is_zero_page(old_page)) { 515 - ret = -EINVAL; 516 - goto put_old; 517 - } 518 - 519 - if (WARN(!is_register && PageCompound(old_page), 520 - "uprobe unregister should never work on compound page\n")) { 521 - ret = -EINVAL; 522 - goto put_old; 520 + ret = verify_opcode(page, opcode_vaddr, &opcode); 521 + if (ret <= 0) { 522 + folio_put(folio); 523 + goto out; 523 524 } 524 525 525 526 /* We are going to replace instruction, update ref_ctr. */ 526 527 if (!ref_ctr_updated && uprobe->ref_ctr_offset) { 527 528 ret = update_ref_ctr(uprobe, mm, is_register ? 
1 : -1); 528 - if (ret) 529 - goto put_old; 529 + if (ret) { 530 + folio_put(folio); 531 + goto out; 532 + } 530 533 531 534 ref_ctr_updated = 1; 532 535 } 533 536 534 537 ret = 0; 535 - if (!is_register && !PageAnon(old_page)) 536 - goto put_old; 537 - 538 - ret = anon_vma_prepare(vma); 539 - if (ret) 540 - goto put_old; 541 - 542 - ret = -ENOMEM; 543 - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 544 - if (!new_page) 545 - goto put_old; 546 - 547 - __SetPageUptodate(new_page); 548 - copy_highpage(new_page, old_page); 549 - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 550 - 551 - if (!is_register) { 552 - struct page *orig_page; 553 - pgoff_t index; 554 - 555 - VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); 556 - 557 - index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; 558 - orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, 559 - index); 560 - 561 - if (orig_page) { 562 - if (PageUptodate(orig_page) && 563 - pages_identical(new_page, orig_page)) { 564 - /* let go new_page */ 565 - put_page(new_page); 566 - new_page = NULL; 567 - 568 - if (PageCompound(orig_page)) 569 - orig_page_huge = true; 570 - } 571 - put_page(orig_page); 572 - } 538 + if (unlikely(!folio_test_anon(folio))) { 539 + VM_WARN_ON_ONCE(is_register); 540 + folio_put(folio); 541 + goto out; 573 542 } 574 543 575 - ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); 576 - if (new_page) 577 - put_page(new_page); 578 - put_old: 579 - put_page(old_page); 544 + if (!is_register) { 545 + /* 546 + * In the common case, we'll be able to zap the page when 547 + * unregistering. So trigger MMU notifiers now, as we won't 548 + * be able to do it under PTL. 
549 + */ 550 + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 551 + vaddr, vaddr + PAGE_SIZE); 552 + mmu_notifier_invalidate_range_start(&range); 553 + } 580 554 581 - if (unlikely(ret == -EAGAIN)) 555 + ret = -EAGAIN; 556 + /* Walk the page tables again, to perform the actual update. */ 557 + if (folio_walk_start(&fw, vma, vaddr, 0)) { 558 + if (fw.page == page) 559 + ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); 560 + folio_walk_end(&fw, vma); 561 + } 562 + 563 + if (!is_register) 564 + mmu_notifier_invalidate_range_end(&range); 565 + 566 + folio_put(folio); 567 + switch (ret) { 568 + case -EFAULT: 569 + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; 570 + fallthrough; 571 + case -EAGAIN: 582 572 goto retry; 573 + default: 574 + break; 575 + } 583 576 577 + out: 584 578 /* Revert back reference counter if instruction update failed. */ 585 - if (ret && is_register && ref_ctr_updated) 579 + if (ret < 0 && is_register && ref_ctr_updated) 586 580 update_ref_ctr(uprobe, mm, -1); 587 581 588 582 /* try collapse pmd for compound page */ 589 - if (!ret && orig_page_huge) 583 + if (ret > 0) 590 584 collapse_pte_mapped_thp(mm, vaddr, false); 591 585 592 - return ret; 586 + return ret < 0 ? ret : 0; 593 587 } 594 588 595 589 /**

Configure Feed

Configure Feed