Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Currently, scan_get_next_rmap_item() walks every page address in a VMA to
locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions, causing
ksmd to use a large amount of CPU without deduplicating many pages.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups. This problem was
previously discussed in [1].

Consider the following test program which creates a 32 TiB mapping in the
virtual address space but only populates a single page:

#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>

/*
 * Size of the sparse mapping: 32 TiB (2^45 bytes). The arithmetic is done in
 * unsigned long, so this needs a 64-bit unsigned long to avoid wrapping.
 */
const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;

/*
 * Reproducer: create one huge, almost entirely unpopulated anonymous private
 * mapping, touch a single page in it, mark the whole range MADV_MERGEABLE and
 * then sleep so the mapping stays alive while ksmd scans it.
 *
 * Returns 0 after pause() is interrupted by a handled signal, or -1 if the
 * mapping cannot be created or cannot be registered with KSM.
 */
int main(void)
{
	char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);

	if (area == MAP_FAILED) {
		/* No trailing newline: perror() appends ": <error>\n" itself. */
		perror("mmap() failed");
		return -1;
	}

	/* Populate a single page such that we get an anon_vma. */
	*area = 0;

	/*
	 * Enable KSM. Report a failure instead of silently pausing on a
	 * mapping that was never registered, which would make the
	 * reproducer appear to "pass" without exercising ksmd at all.
	 */
	if (madvise(area, size, MADV_MERGEABLE) != 0) {
		perror("madvise(MADV_MERGEABLE) failed");
		return -1;
	}

	pause();
	return 0;
}

$ ./ksm-sparse &
$ echo 1 > /sys/kernel/mm/ksm/run

Without this patch ksmd uses 100% of the CPU for a long time (more than 1
hour on my test machine) scanning all the 32 TiB virtual address space
that contains only one mapped page. This makes ksmd essentially deadlocked,
unable to deduplicate anything of value. With this patch ksmd walks
only the one mapped page and skips the rest of the 32 TiB virtual address
space, making the scan fast while using little CPU.

Link: https://lkml.kernel.org/r/20251023035841.41406-1-pedrodemargomes@gmail.com
Link: https://lkml.kernel.org/r/20251022153059.22763-1-pedrodemargomes@gmail.com
Link: https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/ [1]
Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Pedro Demarchi Gomes and committed by
Andrew Morton
f5548c31 7e76b75e

+104 -9
+104 -9
mm/ksm.c
··· 2455 2455 return true; 2456 2456 } 2457 2457 2458 + struct ksm_next_page_arg { 2459 + struct folio *folio; 2460 + struct page *page; 2461 + unsigned long addr; 2462 + }; 2463 + 2464 + static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, 2465 + struct mm_walk *walk) 2466 + { 2467 + struct ksm_next_page_arg *private = walk->private; 2468 + struct vm_area_struct *vma = walk->vma; 2469 + pte_t *start_ptep = NULL, *ptep, pte; 2470 + struct mm_struct *mm = walk->mm; 2471 + struct folio *folio; 2472 + struct page *page; 2473 + spinlock_t *ptl; 2474 + pmd_t pmd; 2475 + 2476 + if (ksm_test_exit(mm)) 2477 + return 0; 2478 + 2479 + cond_resched(); 2480 + 2481 + pmd = pmdp_get_lockless(pmdp); 2482 + if (!pmd_present(pmd)) 2483 + return 0; 2484 + 2485 + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) { 2486 + ptl = pmd_lock(mm, pmdp); 2487 + pmd = pmdp_get(pmdp); 2488 + 2489 + if (!pmd_present(pmd)) { 2490 + goto not_found_unlock; 2491 + } else if (pmd_leaf(pmd)) { 2492 + page = vm_normal_page_pmd(vma, addr, pmd); 2493 + if (!page) 2494 + goto not_found_unlock; 2495 + folio = page_folio(page); 2496 + 2497 + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) 2498 + goto not_found_unlock; 2499 + 2500 + page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); 2501 + goto found_unlock; 2502 + } 2503 + spin_unlock(ptl); 2504 + } 2505 + 2506 + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2507 + if (!start_ptep) 2508 + return 0; 2509 + 2510 + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { 2511 + pte = ptep_get(ptep); 2512 + 2513 + if (!pte_present(pte)) 2514 + continue; 2515 + 2516 + page = vm_normal_page(vma, addr, pte); 2517 + if (!page) 2518 + continue; 2519 + folio = page_folio(page); 2520 + 2521 + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) 2522 + continue; 2523 + goto found_unlock; 2524 + } 2525 + 2526 + not_found_unlock: 2527 + spin_unlock(ptl); 2528 + if (start_ptep) 2529 + 
pte_unmap(start_ptep); 2530 + return 0; 2531 + found_unlock: 2532 + folio_get(folio); 2533 + spin_unlock(ptl); 2534 + if (start_ptep) 2535 + pte_unmap(start_ptep); 2536 + private->page = page; 2537 + private->folio = folio; 2538 + private->addr = addr; 2539 + return 1; 2540 + } 2541 + 2542 + static struct mm_walk_ops ksm_next_page_ops = { 2543 + .pmd_entry = ksm_next_page_pmd_entry, 2544 + .walk_lock = PGWALK_RDLOCK, 2545 + }; 2546 + 2458 2547 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) 2459 2548 { 2460 2549 struct mm_struct *mm; ··· 2631 2542 ksm_scan.address = vma->vm_end; 2632 2543 2633 2544 while (ksm_scan.address < vma->vm_end) { 2545 + struct ksm_next_page_arg ksm_next_page_arg; 2634 2546 struct page *tmp_page = NULL; 2635 - struct folio_walk fw; 2636 2547 struct folio *folio; 2637 2548 2638 2549 if (ksm_test_exit(mm)) 2639 2550 break; 2640 2551 2641 - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); 2642 - if (folio) { 2643 - if (!folio_is_zone_device(folio) && 2644 - folio_test_anon(folio)) { 2645 - folio_get(folio); 2646 - tmp_page = fw.page; 2647 - } 2648 - folio_walk_end(&fw, vma); 2552 + int found; 2553 + 2554 + found = walk_page_range_vma(vma, ksm_scan.address, 2555 + vma->vm_end, 2556 + &ksm_next_page_ops, 2557 + &ksm_next_page_arg); 2558 + 2559 + if (found > 0) { 2560 + folio = ksm_next_page_arg.folio; 2561 + tmp_page = ksm_next_page_arg.page; 2562 + ksm_scan.address = ksm_next_page_arg.addr; 2563 + } else { 2564 + VM_WARN_ON_ONCE(found < 0); 2565 + ksm_scan.address = vma->vm_end - PAGE_SIZE; 2649 2566 } 2650 2567 2651 2568 if (tmp_page) {