memcg-v1: remove charge move code · tjh.dev/kernel@6b61138

-5

include/linux/memcontrol.h

··· 299 299 /* For oom notifier event fd */ 300 300 struct list_head oom_notify; 301 301 302 - /* 303 - * Should we move charges of a task when a task is moved into this 304 - * mem_cgroup ? And what type of charges should we move ? 305 - */ 306 - unsigned long move_charge_at_immigrate; 307 302 /* taken only while moving_account > 0 */ 308 303 spinlock_t move_lock; 309 304 unsigned long move_lock_flags;

-887

mm/memcontrol-v1.c

··· 40 40 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 41 41 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 42 42 43 - /* Stuffs for move charges at task migration. */ 44 - /* 45 - * Types of charges to be moved. 46 - */ 47 - #define MOVE_ANON 0x1ULL 48 - #define MOVE_FILE 0x2ULL 49 - #define MOVE_MASK (MOVE_ANON | MOVE_FILE) 50 - 51 - /* "mc" and its members are protected by cgroup_mutex */ 52 - static struct move_charge_struct { 53 - spinlock_t lock; /* for from, to */ 54 - struct mm_struct *mm; 55 - struct mem_cgroup *from; 56 - struct mem_cgroup *to; 57 - unsigned long flags; 58 - unsigned long precharge; 59 - unsigned long moved_charge; 60 - unsigned long moved_swap; 61 - struct task_struct *moving_task; /* a task moving charges */ 62 - wait_queue_head_t waitq; /* a waitq for other context */ 63 - } mc = { 64 - .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 65 - .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 66 - }; 67 - 68 43 /* for OOM */ 69 44 struct mem_cgroup_eventfd_list { 70 45 struct list_head list; ··· 401 426 return nr_reclaimed; 402 427 } 403 428 404 - /* 405 - * A routine for checking "mem" is under move_account() or not. 406 - * 407 - * Checking a cgroup is mc.from or mc.to or under hierarchy of 408 - * moving cgroups. This is for waiting at high-memory pressure 409 - * caused by "move". 410 - */ 411 - static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 412 - { 413 - struct mem_cgroup *from; 414 - struct mem_cgroup *to; 415 - bool ret = false; 416 - /* 417 - * Unlike task_move routines, we access mc.to, mc.from not under 418 - * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 419 - */ 420 - spin_lock(&mc.lock); 421 - from = mc.from; 422 - to = mc.to; 423 - if (!from) 424 - goto unlock; 425 - 426 - ret = mem_cgroup_is_descendant(from, memcg) || 427 - mem_cgroup_is_descendant(to, memcg); 428 - unlock: 429 - spin_unlock(&mc.lock); 430 - return ret; 431 - } 432 - 433 - bool memcg1_wait_acct_move(struct mem_cgroup *memcg) 434 - { 435 - if (mc.moving_task && current != mc.moving_task) { 436 - if (mem_cgroup_under_move(memcg)) { 437 - DEFINE_WAIT(wait); 438 - prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 439 - /* moving charge context might have finished. */ 440 - if (mc.moving_task) 441 - schedule(); 442 - finish_wait(&mc.waitq, &wait); 443 - return true; 444 - } 445 - } 446 - return false; 447 - } 448 - 449 429 /** 450 430 * folio_memcg_lock - Bind a folio to its memcg. 451 431 * @folio: The folio. ··· 482 552 __folio_memcg_unlock(folio_memcg(folio)); 483 553 } 484 554 485 - #ifdef CONFIG_SWAP 486 - /** 487 - * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 488 - * @entry: swap entry to be moved 489 - * @from: mem_cgroup which the entry is moved from 490 - * @to: mem_cgroup which the entry is moved to 491 - * 492 - * It succeeds only when the swap_cgroup's record for this entry is the same 493 - * as the mem_cgroup's id of @from. 494 - * 495 - * Returns 0 on success, -EINVAL on failure. 496 - * 497 - * The caller must have charged to @to, IOW, called page_counter_charge() about 498 - * both res and memsw, and called css_get(). 499 - */ 500 - static int mem_cgroup_move_swap_account(swp_entry_t entry, 501 - struct mem_cgroup *from, struct mem_cgroup *to) 502 - { 503 - unsigned short old_id, new_id; 504 - 505 - old_id = mem_cgroup_id(from); 506 - new_id = mem_cgroup_id(to); 507 - 508 - if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 509 - mod_memcg_state(from, MEMCG_SWAP, -1); 510 - mod_memcg_state(to, MEMCG_SWAP, 1); 511 - return 0; 512 - } 513 - return -EINVAL; 514 - } 515 - #else 516 - static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 517 - struct mem_cgroup *from, struct mem_cgroup *to) 518 - { 519 - return -EINVAL; 520 - } 521 - #endif 522 - 523 555 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 524 556 struct cftype *cft) 525 557 { ··· 505 613 struct cftype *cft, u64 val) 506 614 { 507 615 return -ENOSYS; 508 - } 509 - #endif 510 - 511 - #ifdef CONFIG_MMU 512 - /* Handlers for move charge at task migration. */ 513 - static int mem_cgroup_do_precharge(unsigned long count) 514 - { 515 - int ret; 516 - 517 - /* Try a single bulk charge without reclaim first, kswapd may wake */ 518 - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); 519 - if (!ret) { 520 - mc.precharge += count; 521 - return ret; 522 - } 523 - 524 - /* Try charges one by one with reclaim, but do not retry */ 525 - while (count--) { 526 - ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); 527 - if (ret) 528 - return ret; 529 - mc.precharge++; 530 - cond_resched(); 531 - } 532 - return 0; 533 - } 534 - 535 - union mc_target { 536 - struct folio *folio; 537 - swp_entry_t ent; 538 - }; 539 - 540 - enum mc_target_type { 541 - MC_TARGET_NONE = 0, 542 - MC_TARGET_PAGE, 543 - MC_TARGET_SWAP, 544 - MC_TARGET_DEVICE, 545 - }; 546 - 547 - static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 548 - unsigned long addr, pte_t ptent) 549 - { 550 - struct page *page = vm_normal_page(vma, addr, ptent); 551 - 552 - if (!page) 553 - return NULL; 554 - if (PageAnon(page)) { 555 - if (!(mc.flags & MOVE_ANON)) 556 - return NULL; 557 - } else { 558 - if (!(mc.flags & MOVE_FILE)) 559 - return NULL; 560 - } 561 - get_page(page); 562 - 563 - return page; 564 - } 565 - 566 - #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) 567 - static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 568 - pte_t ptent, swp_entry_t *entry) 569 - { 570 - struct page *page = NULL; 571 - swp_entry_t ent = pte_to_swp_entry(ptent); 572 - 573 - if (!(mc.flags & MOVE_ANON)) 574 - return NULL; 575 - 576 - /* 577 - * Handle device private pages that are not accessible by the CPU, but 578 - * stored as special swap entries in the page table. 579 - */ 580 - if (is_device_private_entry(ent)) { 581 - page = pfn_swap_entry_to_page(ent); 582 - if (!get_page_unless_zero(page)) 583 - return NULL; 584 - return page; 585 - } 586 - 587 - if (non_swap_entry(ent)) 588 - return NULL; 589 - 590 - /* 591 - * Because swap_cache_get_folio() updates some statistics counter, 592 - * we call find_get_page() with swapper_space directly. 593 - */ 594 - page = find_get_page(swap_address_space(ent), swap_cache_index(ent)); 595 - entry->val = ent.val; 596 - 597 - return page; 598 - } 599 - #else 600 - static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 601 - pte_t ptent, swp_entry_t *entry) 602 - { 603 - return NULL; 604 - } 605 - #endif 606 - 607 - static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 608 - unsigned long addr, pte_t ptent) 609 - { 610 - unsigned long index; 611 - struct folio *folio; 612 - 613 - if (!vma->vm_file) /* anonymous vma */ 614 - return NULL; 615 - if (!(mc.flags & MOVE_FILE)) 616 - return NULL; 617 - 618 - /* folio is moved even if it's not RSS of this task(page-faulted). */ 619 - /* shmem/tmpfs may report page out on swap: account for that too. */ 620 - index = linear_page_index(vma, addr); 621 - folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); 622 - if (IS_ERR(folio)) 623 - return NULL; 624 - return folio_file_page(folio, index); 625 - } 626 - 627 - static void memcg1_check_events(struct mem_cgroup *memcg, int nid); 628 - static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages); 629 - 630 - /** 631 - * mem_cgroup_move_account - move account of the folio 632 - * @folio: The folio. 633 - * @compound: charge the page as compound or small page 634 - * @from: mem_cgroup which the folio is moved from. 635 - * @to: mem_cgroup which the folio is moved to. @from != @to. 636 - * 637 - * The folio must be locked and not on the LRU. 638 - * 639 - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 640 - * from old cgroup. 641 - */ 642 - static int mem_cgroup_move_account(struct folio *folio, 643 - bool compound, 644 - struct mem_cgroup *from, 645 - struct mem_cgroup *to) 646 - { 647 - struct lruvec *from_vec, *to_vec; 648 - struct pglist_data *pgdat; 649 - unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; 650 - int nid, ret; 651 - 652 - VM_BUG_ON(from == to); 653 - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 654 - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 655 - VM_BUG_ON(compound && !folio_test_large(folio)); 656 - 657 - ret = -EINVAL; 658 - if (folio_memcg(folio) != from) 659 - goto out; 660 - 661 - pgdat = folio_pgdat(folio); 662 - from_vec = mem_cgroup_lruvec(from, pgdat); 663 - to_vec = mem_cgroup_lruvec(to, pgdat); 664 - 665 - folio_memcg_lock(folio); 666 - 667 - if (folio_test_anon(folio)) { 668 - if (folio_mapped(folio)) { 669 - __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); 670 - __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); 671 - if (folio_test_pmd_mappable(folio)) { 672 - __mod_lruvec_state(from_vec, NR_ANON_THPS, 673 - -nr_pages); 674 - __mod_lruvec_state(to_vec, NR_ANON_THPS, 675 - nr_pages); 676 - } 677 - } 678 - } else { 679 - __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); 680 - __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); 681 - 682 - if (folio_test_swapbacked(folio)) { 683 - __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); 684 - __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); 685 - } 686 - 687 - if (folio_mapped(folio)) { 688 - __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); 689 - __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); 690 - } 691 - 692 - if (folio_test_dirty(folio)) { 693 - struct address_space *mapping = folio_mapping(folio); 694 - 695 - if (mapping_can_writeback(mapping)) { 696 - __mod_lruvec_state(from_vec, NR_FILE_DIRTY, 697 - -nr_pages); 698 - __mod_lruvec_state(to_vec, NR_FILE_DIRTY, 699 - nr_pages); 700 - } 701 - } 702 - } 703 - 704 - #ifdef CONFIG_SWAP 705 - if (folio_test_swapcache(folio)) { 706 - __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); 707 - __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); 708 - } 709 - #endif 710 - if (folio_test_writeback(folio)) { 711 - __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); 712 - __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); 713 - } 714 - 715 - /* 716 - * All state has been migrated, let's switch to the new memcg. 717 - * 718 - * It is safe to change page's memcg here because the page 719 - * is referenced, charged, isolated, and locked: we can't race 720 - * with (un)charging, migration, LRU putback, or anything else 721 - * that would rely on a stable page's memory cgroup. 722 - * 723 - * Note that folio_memcg_lock is a memcg lock, not a page lock, 724 - * to save space. As soon as we switch page's memory cgroup to a 725 - * new memcg that isn't locked, the above state can change 726 - * concurrently again. Make sure we're truly done with it. 727 - */ 728 - smp_mb(); 729 - 730 - css_get(&to->css); 731 - css_put(&from->css); 732 - 733 - /* Warning should never happen, so don't worry about refcount non-0 */ 734 - WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); 735 - folio->memcg_data = (unsigned long)to; 736 - 737 - __folio_memcg_unlock(from); 738 - 739 - ret = 0; 740 - nid = folio_nid(folio); 741 - 742 - local_irq_disable(); 743 - memcg1_charge_statistics(to, nr_pages); 744 - memcg1_check_events(to, nid); 745 - memcg1_charge_statistics(from, -nr_pages); 746 - memcg1_check_events(from, nid); 747 - local_irq_enable(); 748 - out: 749 - return ret; 750 - } 751 - 752 - /** 753 - * get_mctgt_type - get target type of moving charge 754 - * @vma: the vma the pte to be checked belongs 755 - * @addr: the address corresponding to the pte to be checked 756 - * @ptent: the pte to be checked 757 - * @target: the pointer the target page or swap ent will be stored(can be NULL) 758 - * 759 - * Context: Called with pte lock held. 760 - * Return: 761 - * * MC_TARGET_NONE - If the pte is not a target for move charge. 762 - * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for 763 - * move charge. If @target is not NULL, the folio is stored in target->folio 764 - * with extra refcnt taken (Caller should release it). 765 - * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a 766 - * target for charge migration. If @target is not NULL, the entry is 767 - * stored in target->ent. 768 - * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and 769 - * thus not on the lru. For now such page is charged like a regular page 770 - * would be as it is just special memory taking the place of a regular page. 771 - * See Documentations/vm/hmm.txt and include/linux/hmm.h 772 - */ 773 - static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 774 - unsigned long addr, pte_t ptent, union mc_target *target) 775 - { 776 - struct page *page = NULL; 777 - struct folio *folio; 778 - enum mc_target_type ret = MC_TARGET_NONE; 779 - swp_entry_t ent = { .val = 0 }; 780 - 781 - if (pte_present(ptent)) 782 - page = mc_handle_present_pte(vma, addr, ptent); 783 - else if (pte_none_mostly(ptent)) 784 - /* 785 - * PTE markers should be treated as a none pte here, separated 786 - * from other swap handling below. 787 - */ 788 - page = mc_handle_file_pte(vma, addr, ptent); 789 - else if (is_swap_pte(ptent)) 790 - page = mc_handle_swap_pte(vma, ptent, &ent); 791 - 792 - if (page) 793 - folio = page_folio(page); 794 - if (target && page) { 795 - if (!folio_trylock(folio)) { 796 - folio_put(folio); 797 - return ret; 798 - } 799 - /* 800 - * page_mapped() must be stable during the move. This 801 - * pte is locked, so if it's present, the page cannot 802 - * become unmapped. If it isn't, we have only partial 803 - * control over the mapped state: the page lock will 804 - * prevent new faults against pagecache and swapcache, 805 - * so an unmapped page cannot become mapped. However, 806 - * if the page is already mapped elsewhere, it can 807 - * unmap, and there is nothing we can do about it. 808 - * Alas, skip moving the page in this case. 809 - */ 810 - if (!pte_present(ptent) && page_mapped(page)) { 811 - folio_unlock(folio); 812 - folio_put(folio); 813 - return ret; 814 - } 815 - } 816 - 817 - if (!page && !ent.val) 818 - return ret; 819 - if (page) { 820 - /* 821 - * Do only loose check w/o serialization. 822 - * mem_cgroup_move_account() checks the page is valid or 823 - * not under LRU exclusion. 824 - */ 825 - if (folio_memcg(folio) == mc.from) { 826 - ret = MC_TARGET_PAGE; 827 - if (folio_is_device_private(folio) || 828 - folio_is_device_coherent(folio)) 829 - ret = MC_TARGET_DEVICE; 830 - if (target) 831 - target->folio = folio; 832 - } 833 - if (!ret || !target) { 834 - if (target) 835 - folio_unlock(folio); 836 - folio_put(folio); 837 - } 838 - } 839 - /* 840 - * There is a swap entry and a page doesn't exist or isn't charged. 841 - * But we cannot move a tail-page in a THP. 842 - */ 843 - if (ent.val && !ret && (!page || !PageTransCompound(page)) && 844 - mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 845 - ret = MC_TARGET_SWAP; 846 - if (target) 847 - target->ent = ent; 848 - } 849 - return ret; 850 - } 851 - 852 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 853 - /* 854 - * We don't consider PMD mapped swapping or file mapped pages because THP does 855 - * not support them for now. 856 - * Caller should make sure that pmd_trans_huge(pmd) is true. 857 - */ 858 - static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 859 - unsigned long addr, pmd_t pmd, union mc_target *target) 860 - { 861 - struct page *page = NULL; 862 - struct folio *folio; 863 - enum mc_target_type ret = MC_TARGET_NONE; 864 - 865 - if (unlikely(is_swap_pmd(pmd))) { 866 - VM_BUG_ON(thp_migration_supported() && 867 - !is_pmd_migration_entry(pmd)); 868 - return ret; 869 - } 870 - page = pmd_page(pmd); 871 - VM_BUG_ON_PAGE(!page || !PageHead(page), page); 872 - folio = page_folio(page); 873 - if (!(mc.flags & MOVE_ANON)) 874 - return ret; 875 - if (folio_memcg(folio) == mc.from) { 876 - ret = MC_TARGET_PAGE; 877 - if (target) { 878 - folio_get(folio); 879 - if (!folio_trylock(folio)) { 880 - folio_put(folio); 881 - return MC_TARGET_NONE; 882 - } 883 - target->folio = folio; 884 - } 885 - } 886 - return ret; 887 - } 888 - #else 889 - static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 890 - unsigned long addr, pmd_t pmd, union mc_target *target) 891 - { 892 - return MC_TARGET_NONE; 893 - } 894 - #endif 895 - 896 - static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 897 - unsigned long addr, unsigned long end, 898 - struct mm_walk *walk) 899 - { 900 - struct vm_area_struct *vma = walk->vma; 901 - pte_t *pte; 902 - spinlock_t *ptl; 903 - 904 - ptl = pmd_trans_huge_lock(pmd, vma); 905 - if (ptl) { 906 - /* 907 - * Note their can not be MC_TARGET_DEVICE for now as we do not 908 - * support transparent huge page with MEMORY_DEVICE_PRIVATE but 909 - * this might change. 910 - */ 911 - if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 912 - mc.precharge += HPAGE_PMD_NR; 913 - spin_unlock(ptl); 914 - return 0; 915 - } 916 - 917 - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 918 - if (!pte) 919 - return 0; 920 - for (; addr != end; pte++, addr += PAGE_SIZE) 921 - if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) 922 - mc.precharge++; /* increment precharge temporarily */ 923 - pte_unmap_unlock(pte - 1, ptl); 924 - cond_resched(); 925 - 926 - return 0; 927 - } 928 - 929 - static const struct mm_walk_ops precharge_walk_ops = { 930 - .pmd_entry = mem_cgroup_count_precharge_pte_range, 931 - .walk_lock = PGWALK_RDLOCK, 932 - }; 933 - 934 - static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 935 - { 936 - unsigned long precharge; 937 - 938 - mmap_read_lock(mm); 939 - walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); 940 - mmap_read_unlock(mm); 941 - 942 - precharge = mc.precharge; 943 - mc.precharge = 0; 944 - 945 - return precharge; 946 - } 947 - 948 - static int mem_cgroup_precharge_mc(struct mm_struct *mm) 949 - { 950 - unsigned long precharge = mem_cgroup_count_precharge(mm); 951 - 952 - VM_BUG_ON(mc.moving_task); 953 - mc.moving_task = current; 954 - return mem_cgroup_do_precharge(precharge); 955 - } 956 - 957 - /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 958 - static void __mem_cgroup_clear_mc(void) 959 - { 960 - struct mem_cgroup *from = mc.from; 961 - struct mem_cgroup *to = mc.to; 962 - 963 - /* we must uncharge all the leftover precharges from mc.to */ 964 - if (mc.precharge) { 965 - mem_cgroup_cancel_charge(mc.to, mc.precharge); 966 - mc.precharge = 0; 967 - } 968 - /* 969 - * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 970 - * we must uncharge here. 971 - */ 972 - if (mc.moved_charge) { 973 - mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 974 - mc.moved_charge = 0; 975 - } 976 - /* we must fixup refcnts and charges */ 977 - if (mc.moved_swap) { 978 - /* uncharge swap account from the old cgroup */ 979 - if (!mem_cgroup_is_root(mc.from)) 980 - page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 981 - 982 - mem_cgroup_id_put_many(mc.from, mc.moved_swap); 983 - 984 - /* 985 - * we charged both to->memory and to->memsw, so we 986 - * should uncharge to->memory. 987 - */ 988 - if (!mem_cgroup_is_root(mc.to)) 989 - page_counter_uncharge(&mc.to->memory, mc.moved_swap); 990 - 991 - mc.moved_swap = 0; 992 - } 993 - memcg1_oom_recover(from); 994 - memcg1_oom_recover(to); 995 - wake_up_all(&mc.waitq); 996 - } 997 - 998 - static void mem_cgroup_clear_mc(void) 999 - { 1000 - struct mm_struct *mm = mc.mm; 1001 - 1002 - /* 1003 - * we must clear moving_task before waking up waiters at the end of 1004 - * task migration. 1005 - */ 1006 - mc.moving_task = NULL; 1007 - __mem_cgroup_clear_mc(); 1008 - spin_lock(&mc.lock); 1009 - mc.from = NULL; 1010 - mc.to = NULL; 1011 - mc.mm = NULL; 1012 - spin_unlock(&mc.lock); 1013 - 1014 - mmput(mm); 1015 - } 1016 - 1017 - int memcg1_can_attach(struct cgroup_taskset *tset) 1018 - { 1019 - struct cgroup_subsys_state *css; 1020 - struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ 1021 - struct mem_cgroup *from; 1022 - struct task_struct *leader, *p; 1023 - struct mm_struct *mm; 1024 - unsigned long move_flags; 1025 - int ret = 0; 1026 - 1027 - /* charge immigration isn't supported on the default hierarchy */ 1028 - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) 1029 - return 0; 1030 - 1031 - /* 1032 - * Multi-process migrations only happen on the default hierarchy 1033 - * where charge immigration is not used. Perform charge 1034 - * immigration if @tset contains a leader and whine if there are 1035 - * multiple. 1036 - */ 1037 - p = NULL; 1038 - cgroup_taskset_for_each_leader(leader, css, tset) { 1039 - WARN_ON_ONCE(p); 1040 - p = leader; 1041 - memcg = mem_cgroup_from_css(css); 1042 - } 1043 - if (!p) 1044 - return 0; 1045 - 1046 - /* 1047 - * We are now committed to this value whatever it is. Changes in this 1048 - * tunable will only affect upcoming migrations, not the current one. 1049 - * So we need to save it, and keep it going. 1050 - */ 1051 - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); 1052 - if (!move_flags) 1053 - return 0; 1054 - 1055 - from = mem_cgroup_from_task(p); 1056 - 1057 - VM_BUG_ON(from == memcg); 1058 - 1059 - mm = get_task_mm(p); 1060 - if (!mm) 1061 - return 0; 1062 - /* We move charges only when we move a owner of the mm */ 1063 - if (mm->owner == p) { 1064 - VM_BUG_ON(mc.from); 1065 - VM_BUG_ON(mc.to); 1066 - VM_BUG_ON(mc.precharge); 1067 - VM_BUG_ON(mc.moved_charge); 1068 - VM_BUG_ON(mc.moved_swap); 1069 - 1070 - spin_lock(&mc.lock); 1071 - mc.mm = mm; 1072 - mc.from = from; 1073 - mc.to = memcg; 1074 - mc.flags = move_flags; 1075 - spin_unlock(&mc.lock); 1076 - /* We set mc.moving_task later */ 1077 - 1078 - ret = mem_cgroup_precharge_mc(mm); 1079 - if (ret) 1080 - mem_cgroup_clear_mc(); 1081 - } else { 1082 - mmput(mm); 1083 - } 1084 - return ret; 1085 - } 1086 - 1087 - void memcg1_cancel_attach(struct cgroup_taskset *tset) 1088 - { 1089 - if (mc.to) 1090 - mem_cgroup_clear_mc(); 1091 - } 1092 - 1093 - static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 1094 - unsigned long addr, unsigned long end, 1095 - struct mm_walk *walk) 1096 - { 1097 - int ret = 0; 1098 - struct vm_area_struct *vma = walk->vma; 1099 - pte_t *pte; 1100 - spinlock_t *ptl; 1101 - enum mc_target_type target_type; 1102 - union mc_target target; 1103 - struct folio *folio; 1104 - bool tried_split_before = false; 1105 - 1106 - retry_pmd: 1107 - ptl = pmd_trans_huge_lock(pmd, vma); 1108 - if (ptl) { 1109 - if (mc.precharge < HPAGE_PMD_NR) { 1110 - spin_unlock(ptl); 1111 - return 0; 1112 - } 1113 - target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 1114 - if (target_type == MC_TARGET_PAGE) { 1115 - folio = target.folio; 1116 - /* 1117 - * Deferred split queue locking depends on memcg, 1118 - * and unqueue is unsafe unless folio refcount is 0: 1119 - * split or skip if on the queue? first try to split. 1120 - */ 1121 - if (!list_empty(&folio->_deferred_list)) { 1122 - spin_unlock(ptl); 1123 - if (!tried_split_before) 1124 - split_folio(folio); 1125 - folio_unlock(folio); 1126 - folio_put(folio); 1127 - if (tried_split_before) 1128 - return 0; 1129 - tried_split_before = true; 1130 - goto retry_pmd; 1131 - } 1132 - /* 1133 - * So long as that pmd lock is held, the folio cannot 1134 - * be racily added to the _deferred_list, because 1135 - * __folio_remove_rmap() will find !partially_mapped. 1136 - */ 1137 - if (folio_isolate_lru(folio)) { 1138 - if (!mem_cgroup_move_account(folio, true, 1139 - mc.from, mc.to)) { 1140 - mc.precharge -= HPAGE_PMD_NR; 1141 - mc.moved_charge += HPAGE_PMD_NR; 1142 - } 1143 - folio_putback_lru(folio); 1144 - } 1145 - folio_unlock(folio); 1146 - folio_put(folio); 1147 - } else if (target_type == MC_TARGET_DEVICE) { 1148 - folio = target.folio; 1149 - if (!mem_cgroup_move_account(folio, true, 1150 - mc.from, mc.to)) { 1151 - mc.precharge -= HPAGE_PMD_NR; 1152 - mc.moved_charge += HPAGE_PMD_NR; 1153 - } 1154 - folio_unlock(folio); 1155 - folio_put(folio); 1156 - } 1157 - spin_unlock(ptl); 1158 - return 0; 1159 - } 1160 - 1161 - retry: 1162 - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 1163 - if (!pte) 1164 - return 0; 1165 - for (; addr != end; addr += PAGE_SIZE) { 1166 - pte_t ptent = ptep_get(pte++); 1167 - bool device = false; 1168 - swp_entry_t ent; 1169 - 1170 - if (!mc.precharge) 1171 - break; 1172 - 1173 - switch (get_mctgt_type(vma, addr, ptent, &target)) { 1174 - case MC_TARGET_DEVICE: 1175 - device = true; 1176 - fallthrough; 1177 - case MC_TARGET_PAGE: 1178 - folio = target.folio; 1179 - /* 1180 - * We can have a part of the split pmd here. Moving it 1181 - * can be done but it would be too convoluted so simply 1182 - * ignore such a partial THP and keep it in original 1183 - * memcg. There should be somebody mapping the head. 1184 - */ 1185 - if (folio_test_large(folio)) 1186 - goto put; 1187 - if (!device && !folio_isolate_lru(folio)) 1188 - goto put; 1189 - if (!mem_cgroup_move_account(folio, false, 1190 - mc.from, mc.to)) { 1191 - mc.precharge--; 1192 - /* we uncharge from mc.from later. */ 1193 - mc.moved_charge++; 1194 - } 1195 - if (!device) 1196 - folio_putback_lru(folio); 1197 - put: /* get_mctgt_type() gets & locks the page */ 1198 - folio_unlock(folio); 1199 - folio_put(folio); 1200 - break; 1201 - case MC_TARGET_SWAP: 1202 - ent = target.ent; 1203 - if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 1204 - mc.precharge--; 1205 - mem_cgroup_id_get_many(mc.to, 1); 1206 - /* we fixup other refcnts and charges later. */ 1207 - mc.moved_swap++; 1208 - } 1209 - break; 1210 - default: 1211 - break; 1212 - } 1213 - } 1214 - pte_unmap_unlock(pte - 1, ptl); 1215 - cond_resched(); 1216 - 1217 - if (addr != end) { 1218 - /* 1219 - * We have consumed all precharges we got in can_attach(). 1220 - * We try charge one by one, but don't do any additional 1221 - * charges to mc.to if we have failed in charge once in attach() 1222 - * phase. 1223 - */ 1224 - ret = mem_cgroup_do_precharge(1); 1225 - if (!ret) 1226 - goto retry; 1227 - } 1228 - 1229 - return ret; 1230 - } 1231 - 1232 - static const struct mm_walk_ops charge_walk_ops = { 1233 - .pmd_entry = mem_cgroup_move_charge_pte_range, 1234 - .walk_lock = PGWALK_RDLOCK, 1235 - }; 1236 - 1237 - static void mem_cgroup_move_charge(void) 1238 - { 1239 - lru_add_drain_all(); 1240 - /* 1241 - * Signal folio_memcg_lock() to take the memcg's move_lock 1242 - * while we're moving its pages to another memcg. Then wait 1243 - * for already started RCU-only updates to finish. 1244 - */ 1245 - atomic_inc(&mc.from->moving_account); 1246 - synchronize_rcu(); 1247 - retry: 1248 - if (unlikely(!mmap_read_trylock(mc.mm))) { 1249 - /* 1250 - * Someone who are holding the mmap_lock might be waiting in 1251 - * waitq. So we cancel all extra charges, wake up all waiters, 1252 - * and retry. Because we cancel precharges, we might not be able 1253 - * to move enough charges, but moving charge is a best-effort 1254 - * feature anyway, so it wouldn't be a big problem. 1255 - */ 1256 - __mem_cgroup_clear_mc(); 1257 - cond_resched(); 1258 - goto retry; 1259 - } 1260 - /* 1261 - * When we have consumed all precharges and failed in doing 1262 - * additional charge, the page walk just aborts. 1263 - */ 1264 - walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); 1265 - mmap_read_unlock(mc.mm); 1266 - atomic_dec(&mc.from->moving_account); 1267 - } 1268 - 1269 - void memcg1_move_task(void) 1270 - { 1271 - if (mc.to) { 1272 - mem_cgroup_move_charge(); 1273 - mem_cgroup_clear_mc(); 1274 - } 1275 - } 1276 - 1277 - #else /* !CONFIG_MMU */ 1278 - int memcg1_can_attach(struct cgroup_taskset *tset) 1279 - { 1280 - return 0; 1281 - } 1282 - void memcg1_cancel_attach(struct cgroup_taskset *tset) 1283 - { 1284 - } 1285 - void memcg1_move_task(void) 1286 - { 1287 616 } 1288 617 #endif 1289 618

-6

mm/memcontrol-v1.h

··· 80 80 WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); 81 81 } 82 82 83 - bool memcg1_wait_acct_move(struct mem_cgroup *memcg); 84 - 85 83 struct cgroup_taskset; 86 - int memcg1_can_attach(struct cgroup_taskset *tset); 87 - void memcg1_cancel_attach(struct cgroup_taskset *tset); 88 - void memcg1_move_task(void); 89 84 void memcg1_css_offline(struct mem_cgroup *memcg); 90 85 91 86 /* for encoding cft->private value on file */ ··· 125 130 static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {} 126 131 static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {} 127 132 static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {} 128 - static inline bool memcg1_wait_acct_move(struct mem_cgroup *memcg) { return false; } 129 133 static inline void memcg1_css_offline(struct mem_cgroup *memcg) {} 130 134 131 135 static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) { return true; }

-9

mm/memcontrol.c

··· 2242 2242 */ 2243 2243 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2244 2244 goto retry; 2245 - /* 2246 - * At task move, charge accounts can be doubly counted. So, it's 2247 - * better to wait until the end of task_move if something is going on. 2248 - */ 2249 - if (memcg1_wait_acct_move(mem_over_limit)) 2250 - goto retry; 2251 2245 2252 2246 if (nr_retries--) 2253 2247 goto retry; ··· 4435 4441 .exit = mem_cgroup_exit, 4436 4442 .dfl_cftypes = memory_files, 4437 4443 #ifdef CONFIG_MEMCG_V1 4438 - .can_attach = memcg1_can_attach, 4439 - .cancel_attach = memcg1_cancel_attach, 4440 - .post_attach = memcg1_move_task, 4441 4444 .legacy_cftypes = mem_cgroup_legacy_files, 4442 4445 #endif 4443 4446 .early_init = 0,

Configure Feed

Configure Feed