Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

mm/huge_memory.c: introduce folio_split_unmapped

An `unmapped' parameter was added to __folio_split() and its related call
sites to support splitting folios that are already in the midst of a
migration. This special case arose for device-private folio migration,
because during migration the source and destination can disagree on the
folio size.

Introduce folio_split_unmapped() to handle this special case. Also
refactor the code and add a __folio_freeze_and_split_unmapped() helper
that is common to both __folio_split() and folio_split_unmapped().

This in turn removes the special casing introduced by the unmapped
parameter in __folio_split().

[balbirs@nvidia.com: v2]
Link: https://lkml.kernel.org/r/20251115084041.3914728-1-balbirs@nvidia.com
[balbirs@nvidia.com: fix clang-20 build]
Link: https://lkml.kernel.org/r/20251120134232.3588203-1-balbirs@nvidia.com
[akpm@linux-foundation.org: add `inline' to shmem_uncharge() stub, per Balbir]
Link: https://lkml.kernel.org/r/20251114012228.2634882-1-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Suggested-by: Zi Yan <ziy@nvidia.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Balbir Singh and committed by
Andrew Morton
cab812d9 8826f096

+211 -151
+3 -2
include/linux/huge_mm.h
··· 371 371 372 372 bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); 373 373 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 374 - unsigned int new_order, bool unmapped); 374 + unsigned int new_order); 375 + int folio_split_unmapped(struct folio *folio, unsigned int new_order); 375 376 int min_order_for_split(struct folio *folio); 376 377 int split_folio_to_list(struct folio *folio, struct list_head *list); 377 378 bool folio_split_supported(struct folio *folio, unsigned int new_order, ··· 383 382 static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 384 383 unsigned int new_order) 385 384 { 386 - return __split_huge_page_to_list_to_order(page, list, new_order, false); 385 + return __split_huge_page_to_list_to_order(page, list, new_order); 387 386 } 388 387 static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) 389 388 {
+5 -1
include/linux/shmem_fs.h
··· 136 136 137 137 #ifdef CONFIG_SHMEM 138 138 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); 139 + extern void shmem_uncharge(struct inode *inode, long pages); 139 140 #else 140 141 static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) 141 142 { 142 143 return 0; 144 + } 145 + 146 + static inline void shmem_uncharge(struct inode *inode, long pages) 147 + { 143 148 } 144 149 #endif 145 150 extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, ··· 199 194 } 200 195 201 196 extern bool shmem_charge(struct inode *inode, long pages); 202 - extern void shmem_uncharge(struct inode *inode, long pages); 203 197 204 198 #ifdef CONFIG_USERFAULTFD 205 199 #ifdef CONFIG_SHMEM
+202 -146
mm/huge_memory.c
··· 3739 3739 return true; 3740 3740 } 3741 3741 3742 + static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order, 3743 + struct page *split_at, struct xa_state *xas, 3744 + struct address_space *mapping, bool do_lru, 3745 + struct list_head *list, enum split_type split_type, 3746 + pgoff_t end, int *nr_shmem_dropped, int extra_pins) 3747 + { 3748 + struct folio *end_folio = folio_next(folio); 3749 + struct folio *new_folio, *next; 3750 + int old_order = folio_order(folio); 3751 + int ret = 0; 3752 + struct deferred_split *ds_queue; 3753 + 3754 + VM_WARN_ON_ONCE(!mapping && end); 3755 + /* Prevent deferred_split_scan() touching ->_refcount */ 3756 + ds_queue = folio_split_queue_lock(folio); 3757 + if (folio_ref_freeze(folio, 1 + extra_pins)) { 3758 + struct swap_cluster_info *ci = NULL; 3759 + struct lruvec *lruvec; 3760 + int expected_refs; 3761 + 3762 + if (old_order > 1) { 3763 + if (!list_empty(&folio->_deferred_list)) { 3764 + ds_queue->split_queue_len--; 3765 + /* 3766 + * Reinitialize page_deferred_list after removing the 3767 + * page from the split_queue, otherwise a subsequent 3768 + * split will see list corruption when checking the 3769 + * page_deferred_list. 
3770 + */ 3771 + list_del_init(&folio->_deferred_list); 3772 + } 3773 + if (folio_test_partially_mapped(folio)) { 3774 + folio_clear_partially_mapped(folio); 3775 + mod_mthp_stat(old_order, 3776 + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 3777 + } 3778 + } 3779 + split_queue_unlock(ds_queue); 3780 + if (mapping) { 3781 + int nr = folio_nr_pages(folio); 3782 + 3783 + if (folio_test_pmd_mappable(folio) && 3784 + new_order < HPAGE_PMD_ORDER) { 3785 + if (folio_test_swapbacked(folio)) { 3786 + __lruvec_stat_mod_folio(folio, 3787 + NR_SHMEM_THPS, -nr); 3788 + } else { 3789 + __lruvec_stat_mod_folio(folio, 3790 + NR_FILE_THPS, -nr); 3791 + filemap_nr_thps_dec(mapping); 3792 + } 3793 + } 3794 + } 3795 + 3796 + if (folio_test_swapcache(folio)) { 3797 + if (mapping) { 3798 + VM_WARN_ON_ONCE_FOLIO(mapping, folio); 3799 + return -EINVAL; 3800 + } 3801 + 3802 + ci = swap_cluster_get_and_lock(folio); 3803 + } 3804 + 3805 + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ 3806 + if (do_lru) 3807 + lruvec = folio_lruvec_lock(folio); 3808 + 3809 + ret = __split_unmapped_folio(folio, new_order, split_at, xas, 3810 + mapping, split_type); 3811 + 3812 + /* 3813 + * Unfreeze after-split folios and put them back to the right 3814 + * list. @folio should be kept frozon until page cache 3815 + * entries are updated with all the other after-split folios 3816 + * to prevent others seeing stale page cache entries. 3817 + * As a result, new_folio starts from the next folio of 3818 + * @folio. 
3819 + */ 3820 + for (new_folio = folio_next(folio); new_folio != end_folio; 3821 + new_folio = next) { 3822 + unsigned long nr_pages = folio_nr_pages(new_folio); 3823 + 3824 + next = folio_next(new_folio); 3825 + 3826 + zone_device_private_split_cb(folio, new_folio); 3827 + 3828 + expected_refs = folio_expected_ref_count(new_folio) + 1; 3829 + folio_ref_unfreeze(new_folio, expected_refs); 3830 + 3831 + if (do_lru) 3832 + lru_add_split_folio(folio, new_folio, lruvec, list); 3833 + 3834 + /* 3835 + * Anonymous folio with swap cache. 3836 + * NOTE: shmem in swap cache is not supported yet. 3837 + */ 3838 + if (ci) { 3839 + __swap_cache_replace_folio(ci, folio, new_folio); 3840 + continue; 3841 + } 3842 + 3843 + /* Anonymous folio without swap cache */ 3844 + if (!mapping) 3845 + continue; 3846 + 3847 + /* Add the new folio to the page cache. */ 3848 + if (new_folio->index < end) { 3849 + __xa_store(&mapping->i_pages, new_folio->index, 3850 + new_folio, 0); 3851 + continue; 3852 + } 3853 + 3854 + VM_WARN_ON_ONCE(!nr_shmem_dropped); 3855 + /* Drop folio beyond EOF: ->index >= end */ 3856 + if (shmem_mapping(mapping) && nr_shmem_dropped) 3857 + *nr_shmem_dropped += nr_pages; 3858 + else if (folio_test_clear_dirty(new_folio)) 3859 + folio_account_cleaned( 3860 + new_folio, inode_to_wb(mapping->host)); 3861 + __filemap_remove_folio(new_folio, NULL); 3862 + folio_put_refs(new_folio, nr_pages); 3863 + } 3864 + 3865 + zone_device_private_split_cb(folio, NULL); 3866 + /* 3867 + * Unfreeze @folio only after all page cache entries, which 3868 + * used to point to it, have been updated with new folios. 3869 + * Otherwise, a parallel folio_try_get() can grab @folio 3870 + * and its caller can see stale page cache entries. 
3871 + */ 3872 + expected_refs = folio_expected_ref_count(folio) + 1; 3873 + folio_ref_unfreeze(folio, expected_refs); 3874 + 3875 + if (do_lru) 3876 + unlock_page_lruvec(lruvec); 3877 + 3878 + if (ci) 3879 + swap_cluster_unlock(ci); 3880 + } else { 3881 + split_queue_unlock(ds_queue); 3882 + return -EAGAIN; 3883 + } 3884 + 3885 + return ret; 3886 + } 3887 + 3742 3888 /** 3743 3889 * __folio_split() - split a folio at @split_at to a @new_order folio 3744 3890 * @folio: folio to split ··· 3893 3747 * @lock_at: a page within @folio to be left locked to caller 3894 3748 * @list: after-split folios will be put on it if non NULL 3895 3749 * @split_type: perform uniform split or not (non-uniform split) 3896 - * @unmapped: The pages are already unmapped, they are migration entries. 3897 3750 * 3898 3751 * It calls __split_unmapped_folio() to perform uniform and non-uniform split. 3899 3752 * It is in charge of checking whether the split is supported or not and ··· 3908 3763 */ 3909 3764 static int __folio_split(struct folio *folio, unsigned int new_order, 3910 3765 struct page *split_at, struct page *lock_at, 3911 - struct list_head *list, enum split_type split_type, bool unmapped) 3766 + struct list_head *list, enum split_type split_type) 3912 3767 { 3913 - struct deferred_split *ds_queue; 3914 3768 XA_STATE(xas, &folio->mapping->i_pages, folio->index); 3915 3769 struct folio *end_folio = folio_next(folio); 3916 3770 bool is_anon = folio_test_anon(folio); ··· 3920 3776 int nr_shmem_dropped = 0; 3921 3777 int remap_flags = 0; 3922 3778 int extra_pins, ret; 3923 - pgoff_t end; 3779 + pgoff_t end = 0; 3924 3780 bool is_hzp; 3925 3781 3926 3782 VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); ··· 3963 3819 * is taken to serialise against parallel split or collapse 3964 3820 * operations. 
3965 3821 */ 3966 - if (!unmapped) { 3967 - anon_vma = folio_get_anon_vma(folio); 3968 - if (!anon_vma) { 3969 - ret = -EBUSY; 3970 - goto out; 3971 - } 3972 - anon_vma_lock_write(anon_vma); 3822 + anon_vma = folio_get_anon_vma(folio); 3823 + if (!anon_vma) { 3824 + ret = -EBUSY; 3825 + goto out; 3973 3826 } 3827 + anon_vma_lock_write(anon_vma); 3974 3828 mapping = NULL; 3975 3829 } else { 3976 3830 unsigned int min_order; ··· 4022 3880 goto out_unlock; 4023 3881 } 4024 3882 4025 - if (!unmapped) 4026 - unmap_folio(folio); 3883 + unmap_folio(folio); 4027 3884 4028 3885 /* block interrupt reentry in xa_lock and spinlock */ 4029 3886 local_irq_disable(); ··· 4039 3898 } 4040 3899 } 4041 3900 4042 - /* Prevent deferred_split_scan() touching ->_refcount */ 4043 - ds_queue = folio_split_queue_lock(folio); 4044 - if (folio_ref_freeze(folio, 1 + extra_pins)) { 4045 - struct swap_cluster_info *ci = NULL; 4046 - struct lruvec *lruvec; 4047 - int expected_refs; 4048 - 4049 - if (old_order > 1) { 4050 - if (!list_empty(&folio->_deferred_list)) { 4051 - ds_queue->split_queue_len--; 4052 - /* 4053 - * Reinitialize page_deferred_list after removing the 4054 - * page from the split_queue, otherwise a subsequent 4055 - * split will see list corruption when checking the 4056 - * page_deferred_list. 
4057 - */ 4058 - list_del_init(&folio->_deferred_list); 4059 - } 4060 - if (folio_test_partially_mapped(folio)) { 4061 - folio_clear_partially_mapped(folio); 4062 - mod_mthp_stat(old_order, 4063 - MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); 4064 - } 4065 - } 4066 - split_queue_unlock(ds_queue); 4067 - if (mapping) { 4068 - int nr = folio_nr_pages(folio); 4069 - 4070 - if (folio_test_pmd_mappable(folio) && 4071 - new_order < HPAGE_PMD_ORDER) { 4072 - if (folio_test_swapbacked(folio)) { 4073 - __lruvec_stat_mod_folio(folio, 4074 - NR_SHMEM_THPS, -nr); 4075 - } else { 4076 - __lruvec_stat_mod_folio(folio, 4077 - NR_FILE_THPS, -nr); 4078 - filemap_nr_thps_dec(mapping); 4079 - } 4080 - } 4081 - } 4082 - 4083 - if (folio_test_swapcache(folio)) { 4084 - if (mapping) { 4085 - VM_WARN_ON_ONCE_FOLIO(mapping, folio); 4086 - ret = -EINVAL; 4087 - goto fail; 4088 - } 4089 - 4090 - ci = swap_cluster_get_and_lock(folio); 4091 - } 4092 - 4093 - /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ 4094 - lruvec = folio_lruvec_lock(folio); 4095 - 4096 - ret = __split_unmapped_folio(folio, new_order, split_at, &xas, 4097 - mapping, split_type); 4098 - 4099 - /* 4100 - * Unfreeze after-split folios and put them back to the right 4101 - * list. @folio should be kept frozon until page cache 4102 - * entries are updated with all the other after-split folios 4103 - * to prevent others seeing stale page cache entries. 4104 - * As a result, new_folio starts from the next folio of 4105 - * @folio. 
4106 - */ 4107 - for (new_folio = folio_next(folio); new_folio != end_folio; 4108 - new_folio = next) { 4109 - unsigned long nr_pages = folio_nr_pages(new_folio); 4110 - 4111 - next = folio_next(new_folio); 4112 - 4113 - zone_device_private_split_cb(folio, new_folio); 4114 - 4115 - expected_refs = folio_expected_ref_count(new_folio) + 1; 4116 - folio_ref_unfreeze(new_folio, expected_refs); 4117 - 4118 - if (!unmapped) 4119 - lru_add_split_folio(folio, new_folio, lruvec, list); 4120 - 4121 - /* 4122 - * Anonymous folio with swap cache. 4123 - * NOTE: shmem in swap cache is not supported yet. 4124 - */ 4125 - if (ci) { 4126 - __swap_cache_replace_folio(ci, folio, new_folio); 4127 - continue; 4128 - } 4129 - 4130 - /* Anonymous folio without swap cache */ 4131 - if (!mapping) 4132 - continue; 4133 - 4134 - /* Add the new folio to the page cache. */ 4135 - if (new_folio->index < end) { 4136 - __xa_store(&mapping->i_pages, new_folio->index, 4137 - new_folio, 0); 4138 - continue; 4139 - } 4140 - 4141 - /* Drop folio beyond EOF: ->index >= end */ 4142 - if (shmem_mapping(mapping)) 4143 - nr_shmem_dropped += nr_pages; 4144 - else if (folio_test_clear_dirty(new_folio)) 4145 - folio_account_cleaned( 4146 - new_folio, inode_to_wb(mapping->host)); 4147 - __filemap_remove_folio(new_folio, NULL); 4148 - folio_put_refs(new_folio, nr_pages); 4149 - } 4150 - 4151 - zone_device_private_split_cb(folio, NULL); 4152 - /* 4153 - * Unfreeze @folio only after all page cache entries, which 4154 - * used to point to it, have been updated with new folios. 4155 - * Otherwise, a parallel folio_try_get() can grab @folio 4156 - * and its caller can see stale page cache entries. 
4157 - */ 4158 - expected_refs = folio_expected_ref_count(folio) + 1; 4159 - folio_ref_unfreeze(folio, expected_refs); 4160 - 4161 - unlock_page_lruvec(lruvec); 4162 - 4163 - if (ci) 4164 - swap_cluster_unlock(ci); 4165 - } else { 4166 - split_queue_unlock(ds_queue); 4167 - ret = -EAGAIN; 4168 - } 3901 + ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping, 3902 + true, list, split_type, end, &nr_shmem_dropped, 3903 + extra_pins); 4169 3904 fail: 4170 3905 if (mapping) 4171 3906 xas_unlock(&xas); 4172 3907 4173 3908 local_irq_enable(); 4174 - 4175 - if (unmapped) 4176 - return ret; 4177 3909 4178 3910 if (nr_shmem_dropped) 4179 3911 shmem_uncharge(mapping->host, nr_shmem_dropped); ··· 4088 4074 if (old_order == HPAGE_PMD_ORDER) 4089 4075 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 4090 4076 count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); 4077 + return ret; 4078 + } 4079 + 4080 + /** 4081 + * folio_split_unmapped() - split a large anon folio that is already unmapped 4082 + * @folio: folio to split 4083 + * @new_order: the order of folios after split 4084 + * 4085 + * This function is a helper for splitting folios that have already been 4086 + * unmapped. The use case is that the device or the CPU can refuse to migrate 4087 + * THP pages in the middle of migration, due to allocation issues on either 4088 + * side. 4089 + * 4090 + * anon_vma_lock is not required to be held, mmap_read_lock() or 4091 + * mmap_write_lock() should be held. @folio is expected to be locked by the 4092 + * caller. device-private and non device-private folios are supported along 4093 + * with folios that are in the swapcache. @folio should also be unmapped and 4094 + * isolated from LRU (if applicable) 4095 + * 4096 + * Upon return, the folio is not remapped, split folios are not added to LRU, 4097 + * free_folio_and_swap_cache() is not called, and new folios remain locked. 
4098 + * 4099 + * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to 4100 + * insufficient reference count or extra pins). 4101 + */ 4102 + int folio_split_unmapped(struct folio *folio, unsigned int new_order) 4103 + { 4104 + int extra_pins, ret = 0; 4105 + 4106 + VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); 4107 + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); 4108 + VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio); 4109 + VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio); 4110 + 4111 + if (!can_split_folio(folio, 1, &extra_pins)) 4112 + return -EAGAIN; 4113 + 4114 + local_irq_disable(); 4115 + ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL, 4116 + NULL, false, NULL, SPLIT_TYPE_UNIFORM, 4117 + 0, NULL, extra_pins); 4118 + local_irq_enable(); 4091 4119 return ret; 4092 4120 } 4093 4121 ··· 4181 4125 * with the folio. Splitting to order 0 is compatible with all folios. 4182 4126 */ 4183 4127 int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 4184 - unsigned int new_order, bool unmapped) 4128 + unsigned int new_order) 4185 4129 { 4186 4130 struct folio *folio = page_folio(page); 4187 4131 4188 4132 return __folio_split(folio, new_order, &folio->page, page, list, 4189 - SPLIT_TYPE_UNIFORM, unmapped); 4133 + SPLIT_TYPE_UNIFORM); 4190 4134 } 4191 4135 4192 4136 /** ··· 4217 4161 struct page *split_at, struct list_head *list) 4218 4162 { 4219 4163 return __folio_split(folio, new_order, split_at, &folio->page, list, 4220 - SPLIT_TYPE_NON_UNIFORM, false); 4164 + SPLIT_TYPE_NON_UNIFORM); 4221 4165 } 4222 4166 4223 4167 int min_order_for_split(struct folio *folio)
+1 -2
mm/migrate_device.c
··· 916 916 917 917 folio_get(folio); 918 918 split_huge_pmd_address(migrate->vma, addr, true); 919 - ret = __split_huge_page_to_list_to_order(folio_page(folio, 0), NULL, 920 - 0, true); 919 + ret = folio_split_unmapped(folio, 0); 921 920 if (ret) 922 921 return ret; 923 922 migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;