Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

lib/test_hmm: add zone device private THP test infrastructure

Enhance the hmm test driver (lib/test_hmm) with support for THP pages.

A new pool of free folios (the free_folios list) has been added to the
dmirror device; a folio is taken from this pool when a request for a THP
zone device private page is made.

Add compound page awareness to the allocation functions used during normal
migration and fault-based migration. These routines also copy
folio_nr_pages() pages when moving data between system memory and device
memory.

args.src and args.dst, which hold the migration entries, are now dynamically
allocated (as they need to hold HPAGE_PMD_NR entries or more).

Split and migrate support will be added in future patches in this series.

Link: https://lkml.kernel.org/r/20251001065707.920170-10-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Balbir Singh and committed by
Andrew Morton
775465fd 49640991

+304 -76
+12
include/linux/memremap.h
··· 177 177 folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; 178 178 } 179 179 180 + static inline void *folio_zone_device_data(const struct folio *folio) 181 + { 182 + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); 183 + return folio->page.zone_device_data; 184 + } 185 + 186 + static inline void folio_set_zone_device_data(struct folio *folio, void *data) 187 + { 188 + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); 189 + folio->page.zone_device_data = data; 190 + } 191 + 180 192 static inline bool is_pci_p2pdma_page(const struct page *page) 181 193 { 182 194 return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
+292 -76
lib/test_hmm.c
··· 119 119 unsigned long calloc; 120 120 unsigned long cfree; 121 121 struct page *free_pages; 122 + struct folio *free_folios; 122 123 spinlock_t lock; /* protects the above */ 123 124 }; 124 125 ··· 493 492 } 494 493 495 494 static int dmirror_allocate_chunk(struct dmirror_device *mdevice, 496 - struct page **ppage) 495 + struct page **ppage, bool is_large) 497 496 { 498 497 struct dmirror_chunk *devmem; 499 498 struct resource *res = NULL; ··· 573 572 pfn_first, pfn_last); 574 573 575 574 spin_lock(&mdevice->lock); 576 - for (pfn = pfn_first; pfn < pfn_last; pfn++) { 575 + for (pfn = pfn_first; pfn < pfn_last; ) { 577 576 struct page *page = pfn_to_page(pfn); 577 + 578 + if (is_large && IS_ALIGNED(pfn, HPAGE_PMD_NR) 579 + && (pfn + HPAGE_PMD_NR <= pfn_last)) { 580 + page->zone_device_data = mdevice->free_folios; 581 + mdevice->free_folios = page_folio(page); 582 + pfn += HPAGE_PMD_NR; 583 + continue; 584 + } 578 585 579 586 page->zone_device_data = mdevice->free_pages; 580 587 mdevice->free_pages = page; 588 + pfn++; 581 589 } 590 + 591 + ret = 0; 582 592 if (ppage) { 583 - *ppage = mdevice->free_pages; 584 - mdevice->free_pages = (*ppage)->zone_device_data; 585 - mdevice->calloc++; 593 + if (is_large) { 594 + if (!mdevice->free_folios) { 595 + ret = -ENOMEM; 596 + goto err_unlock; 597 + } 598 + *ppage = folio_page(mdevice->free_folios, 0); 599 + mdevice->free_folios = (*ppage)->zone_device_data; 600 + mdevice->calloc += HPAGE_PMD_NR; 601 + } else if (mdevice->free_pages) { 602 + *ppage = mdevice->free_pages; 603 + mdevice->free_pages = (*ppage)->zone_device_data; 604 + mdevice->calloc++; 605 + } else { 606 + ret = -ENOMEM; 607 + goto err_unlock; 608 + } 586 609 } 610 + err_unlock: 587 611 spin_unlock(&mdevice->lock); 588 612 589 - return 0; 613 + return ret; 590 614 591 615 err_release: 592 616 mutex_unlock(&mdevice->devmem_lock); ··· 624 598 return ret; 625 599 } 626 600 627 - static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) 601 + 
static struct page *dmirror_devmem_alloc_page(struct dmirror *dmirror, 602 + bool is_large) 628 603 { 629 604 struct page *dpage = NULL; 630 605 struct page *rpage = NULL; 606 + unsigned int order = is_large ? HPAGE_PMD_ORDER : 0; 607 + struct dmirror_device *mdevice = dmirror->mdevice; 631 608 632 609 /* 633 610 * For ZONE_DEVICE private type, this is a fake device so we allocate ··· 639 610 * data and ignore rpage. 640 611 */ 641 612 if (dmirror_is_private_zone(mdevice)) { 642 - rpage = alloc_page(GFP_HIGHUSER); 613 + rpage = folio_page(folio_alloc(GFP_HIGHUSER, order), 0); 643 614 if (!rpage) 644 615 return NULL; 645 616 } 646 617 spin_lock(&mdevice->lock); 647 618 648 - if (mdevice->free_pages) { 619 + if (is_large && mdevice->free_folios) { 620 + dpage = folio_page(mdevice->free_folios, 0); 621 + mdevice->free_folios = dpage->zone_device_data; 622 + mdevice->calloc += 1 << order; 623 + spin_unlock(&mdevice->lock); 624 + } else if (!is_large && mdevice->free_pages) { 649 625 dpage = mdevice->free_pages; 650 626 mdevice->free_pages = dpage->zone_device_data; 651 627 mdevice->calloc++; 652 628 spin_unlock(&mdevice->lock); 653 629 } else { 654 630 spin_unlock(&mdevice->lock); 655 - if (dmirror_allocate_chunk(mdevice, &dpage)) 631 + if (dmirror_allocate_chunk(mdevice, &dpage, is_large)) 656 632 goto error; 657 633 } 658 634 659 - zone_device_page_init(dpage, 0); 635 + zone_device_folio_init(page_folio(dpage), order); 660 636 dpage->zone_device_data = rpage; 661 637 return dpage; 662 638 663 639 error: 664 640 if (rpage) 665 - __free_page(rpage); 641 + __free_pages(rpage, order); 666 642 return NULL; 667 643 } 668 644 669 645 static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, 670 646 struct dmirror *dmirror) 671 647 { 672 - struct dmirror_device *mdevice = dmirror->mdevice; 673 648 const unsigned long *src = args->src; 674 649 unsigned long *dst = args->dst; 675 650 unsigned long addr; 676 651 677 - for (addr = args->start; addr < args->end; addr 
+= PAGE_SIZE, 678 - src++, dst++) { 652 + for (addr = args->start; addr < args->end; ) { 679 653 struct page *spage; 680 654 struct page *dpage; 681 655 struct page *rpage; 656 + bool is_large = *src & MIGRATE_PFN_COMPOUND; 657 + int write = (*src & MIGRATE_PFN_WRITE) ? MIGRATE_PFN_WRITE : 0; 658 + unsigned long nr = 1; 682 659 683 660 if (!(*src & MIGRATE_PFN_MIGRATE)) 684 - continue; 661 + goto next; 685 662 686 663 /* 687 664 * Note that spage might be NULL which is OK since it is an ··· 697 662 if (WARN(spage && is_zone_device_page(spage), 698 663 "page already in device spage pfn: 0x%lx\n", 699 664 page_to_pfn(spage))) 700 - continue; 665 + goto next; 701 666 702 - dpage = dmirror_devmem_alloc_page(mdevice); 703 - if (!dpage) 667 + dpage = dmirror_devmem_alloc_page(dmirror, is_large); 668 + if (!dpage) { 669 + struct folio *folio; 670 + unsigned long i; 671 + unsigned long spfn = *src >> MIGRATE_PFN_SHIFT; 672 + struct page *src_page; 673 + 674 + if (!is_large) 675 + goto next; 676 + 677 + if (!spage && is_large) { 678 + nr = HPAGE_PMD_NR; 679 + } else { 680 + folio = page_folio(spage); 681 + nr = folio_nr_pages(folio); 682 + } 683 + 684 + for (i = 0; i < nr && addr < args->end; i++) { 685 + dpage = dmirror_devmem_alloc_page(dmirror, false); 686 + rpage = BACKING_PAGE(dpage); 687 + rpage->zone_device_data = dmirror; 688 + 689 + *dst = migrate_pfn(page_to_pfn(dpage)) | write; 690 + src_page = pfn_to_page(spfn + i); 691 + 692 + if (spage) 693 + copy_highpage(rpage, src_page); 694 + else 695 + clear_highpage(rpage); 696 + src++; 697 + dst++; 698 + addr += PAGE_SIZE; 699 + } 704 700 continue; 701 + } 705 702 706 703 rpage = BACKING_PAGE(dpage); 707 - if (spage) 708 - copy_highpage(rpage, spage); 709 - else 710 - clear_highpage(rpage); 711 704 712 705 /* 713 706 * Normally, a device would use the page->zone_device_data to ··· 747 684 748 685 pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", 749 686 page_to_pfn(spage), page_to_pfn(dpage)); 750 - 
*dst = migrate_pfn(page_to_pfn(dpage)); 751 - if ((*src & MIGRATE_PFN_WRITE) || 752 - (!spage && args->vma->vm_flags & VM_WRITE)) 753 - *dst |= MIGRATE_PFN_WRITE; 687 + 688 + *dst = migrate_pfn(page_to_pfn(dpage)) | write; 689 + 690 + if (is_large) { 691 + int i; 692 + struct folio *folio = page_folio(dpage); 693 + *dst |= MIGRATE_PFN_COMPOUND; 694 + 695 + if (folio_test_large(folio)) { 696 + for (i = 0; i < folio_nr_pages(folio); i++) { 697 + struct page *dst_page = 698 + pfn_to_page(page_to_pfn(rpage) + i); 699 + struct page *src_page = 700 + pfn_to_page(page_to_pfn(spage) + i); 701 + 702 + if (spage) 703 + copy_highpage(dst_page, src_page); 704 + else 705 + clear_highpage(dst_page); 706 + src++; 707 + dst++; 708 + addr += PAGE_SIZE; 709 + } 710 + continue; 711 + } 712 + } 713 + 714 + if (spage) 715 + copy_highpage(rpage, spage); 716 + else 717 + clear_highpage(rpage); 718 + 719 + next: 720 + src++; 721 + dst++; 722 + addr += PAGE_SIZE; 754 723 } 755 724 } 756 725 ··· 829 734 const unsigned long *src = args->src; 830 735 const unsigned long *dst = args->dst; 831 736 unsigned long pfn; 737 + const unsigned long start_pfn = start >> PAGE_SHIFT; 738 + const unsigned long end_pfn = end >> PAGE_SHIFT; 832 739 833 740 /* Map the migrated pages into the device's page tables. 
*/ 834 741 mutex_lock(&dmirror->mutex); 835 742 836 - for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, 837 - src++, dst++) { 743 + for (pfn = start_pfn; pfn < end_pfn; pfn++, src++, dst++) { 838 744 struct page *dpage; 839 745 void *entry; 746 + int nr, i; 747 + struct page *rpage; 840 748 841 749 if (!(*src & MIGRATE_PFN_MIGRATE)) 842 750 continue; ··· 848 750 if (!dpage) 849 751 continue; 850 752 851 - entry = BACKING_PAGE(dpage); 852 - if (*dst & MIGRATE_PFN_WRITE) 853 - entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); 854 - entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); 855 - if (xa_is_err(entry)) { 856 - mutex_unlock(&dmirror->mutex); 857 - return xa_err(entry); 753 + if (*dst & MIGRATE_PFN_COMPOUND) 754 + nr = folio_nr_pages(page_folio(dpage)); 755 + else 756 + nr = 1; 757 + 758 + WARN_ON_ONCE(end_pfn < start_pfn + nr); 759 + 760 + rpage = BACKING_PAGE(dpage); 761 + VM_WARN_ON(folio_nr_pages(page_folio(rpage)) != nr); 762 + 763 + for (i = 0; i < nr; i++) { 764 + entry = folio_page(page_folio(rpage), i); 765 + if (*dst & MIGRATE_PFN_WRITE) 766 + entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); 767 + entry = xa_store(&dmirror->pt, pfn + i, entry, GFP_ATOMIC); 768 + if (xa_is_err(entry)) { 769 + mutex_unlock(&dmirror->mutex); 770 + return xa_err(entry); 771 + } 858 772 } 859 773 } 860 774 ··· 939 829 unsigned long start = args->start; 940 830 unsigned long end = args->end; 941 831 unsigned long addr; 832 + unsigned int order = 0; 833 + int i; 942 834 943 - for (addr = start; addr < end; addr += PAGE_SIZE, 944 - src++, dst++) { 835 + for (addr = start; addr < end; ) { 945 836 struct page *dpage, *spage; 946 837 947 838 spage = migrate_pfn_to_page(*src); 948 - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) 949 - continue; 839 + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) { 840 + addr += PAGE_SIZE; 841 + goto next; 842 + } 950 843 951 844 if (WARN_ON(!is_device_private_page(spage) && 952 - !is_device_coherent_page(spage))) 953 - continue; 
954 - spage = BACKING_PAGE(spage); 955 - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); 956 - if (!dpage) 957 - continue; 958 - pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", 959 - page_to_pfn(spage), page_to_pfn(dpage)); 845 + !is_device_coherent_page(spage))) { 846 + addr += PAGE_SIZE; 847 + goto next; 848 + } 960 849 850 + spage = BACKING_PAGE(spage); 851 + order = folio_order(page_folio(spage)); 852 + 853 + if (order) 854 + dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 855 + order, args->vma, addr), 0); 856 + else 857 + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); 858 + 859 + /* Try with smaller pages if large allocation fails */ 860 + if (!dpage && order) { 861 + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); 862 + if (!dpage) 863 + return VM_FAULT_OOM; 864 + order = 0; 865 + } 866 + 867 + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", 868 + page_to_pfn(spage), page_to_pfn(dpage)); 961 869 lock_page(dpage); 962 870 xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); 963 871 copy_highpage(dpage, spage); 964 872 *dst = migrate_pfn(page_to_pfn(dpage)); 965 873 if (*src & MIGRATE_PFN_WRITE) 966 874 *dst |= MIGRATE_PFN_WRITE; 875 + if (order) 876 + *dst |= MIGRATE_PFN_COMPOUND; 877 + 878 + for (i = 0; i < (1 << order); i++) { 879 + struct page *src_page; 880 + struct page *dst_page; 881 + 882 + src_page = pfn_to_page(page_to_pfn(spage) + i); 883 + dst_page = pfn_to_page(page_to_pfn(dpage) + i); 884 + 885 + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); 886 + copy_highpage(dst_page, src_page); 887 + } 888 + next: 889 + addr += PAGE_SIZE << order; 890 + src += 1 << order; 891 + dst += 1 << order; 967 892 } 968 893 return 0; 969 894 } ··· 1024 879 unsigned long size = cmd->npages << PAGE_SHIFT; 1025 880 struct mm_struct *mm = dmirror->notifier.mm; 1026 881 struct vm_area_struct *vma; 1027 - unsigned long src_pfns[32] = { 0 }; 1028 - unsigned long dst_pfns[32] = { 0 }; 1029 
882 struct migrate_vma args = { 0 }; 1030 883 unsigned long next; 1031 884 int ret; 885 + unsigned long *src_pfns; 886 + unsigned long *dst_pfns; 887 + 888 + src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); 889 + dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); 1032 890 1033 891 start = cmd->addr; 1034 892 end = start + size; ··· 1050 902 ret = -EINVAL; 1051 903 goto out; 1052 904 } 1053 - next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); 905 + next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT)); 1054 906 if (next > vma->vm_end) 1055 907 next = vma->vm_end; 1056 908 ··· 1060 912 args.start = addr; 1061 913 args.end = next; 1062 914 args.pgmap_owner = dmirror->mdevice; 1063 - args.flags = dmirror_select_device(dmirror); 915 + args.flags = dmirror_select_device(dmirror) | MIGRATE_VMA_SELECT_COMPOUND; 1064 916 1065 917 ret = migrate_vma_setup(&args); 1066 918 if (ret) ··· 1076 928 out: 1077 929 mmap_read_unlock(mm); 1078 930 mmput(mm); 931 + kvfree(src_pfns); 932 + kvfree(dst_pfns); 1079 933 1080 934 return ret; 1081 935 } ··· 1089 939 unsigned long size = cmd->npages << PAGE_SHIFT; 1090 940 struct mm_struct *mm = dmirror->notifier.mm; 1091 941 struct vm_area_struct *vma; 1092 - unsigned long src_pfns[32] = { 0 }; 1093 - unsigned long dst_pfns[32] = { 0 }; 1094 942 struct dmirror_bounce bounce; 1095 943 struct migrate_vma args = { 0 }; 1096 944 unsigned long next; 1097 945 int ret; 946 + unsigned long *src_pfns = NULL; 947 + unsigned long *dst_pfns = NULL; 1098 948 1099 949 start = cmd->addr; 1100 950 end = start + size; ··· 1105 955 if (!mmget_not_zero(mm)) 1106 956 return -EINVAL; 1107 957 958 + ret = -ENOMEM; 959 + src_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*src_pfns), 960 + GFP_KERNEL | __GFP_NOFAIL); 961 + if (!src_pfns) 962 + goto free_mem; 963 + 964 + dst_pfns = kvcalloc(PTRS_PER_PTE, sizeof(*dst_pfns), 965 + GFP_KERNEL | __GFP_NOFAIL); 966 + if (!dst_pfns) 967 + goto free_mem; 968 + 969 
+ ret = 0; 1108 970 mmap_read_lock(mm); 1109 971 for (addr = start; addr < end; addr = next) { 1110 972 vma = vma_lookup(mm, addr); ··· 1124 962 ret = -EINVAL; 1125 963 goto out; 1126 964 } 1127 - next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); 965 + next = min(end, addr + (PTRS_PER_PTE << PAGE_SHIFT)); 1128 966 if (next > vma->vm_end) 1129 967 next = vma->vm_end; 1130 968 ··· 1134 972 args.start = addr; 1135 973 args.end = next; 1136 974 args.pgmap_owner = dmirror->mdevice; 1137 - args.flags = MIGRATE_VMA_SELECT_SYSTEM; 975 + args.flags = MIGRATE_VMA_SELECT_SYSTEM | 976 + MIGRATE_VMA_SELECT_COMPOUND; 1138 977 ret = migrate_vma_setup(&args); 1139 978 if (ret) 1140 979 goto out; ··· 1155 992 */ 1156 993 ret = dmirror_bounce_init(&bounce, start, size); 1157 994 if (ret) 1158 - return ret; 995 + goto free_mem; 1159 996 mutex_lock(&dmirror->mutex); 1160 997 ret = dmirror_do_read(dmirror, start, end, &bounce); 1161 998 mutex_unlock(&dmirror->mutex); ··· 1166 1003 } 1167 1004 cmd->cpages = bounce.cpages; 1168 1005 dmirror_bounce_fini(&bounce); 1169 - return ret; 1006 + goto free_mem; 1170 1007 1171 1008 out: 1172 1009 mmap_read_unlock(mm); 1173 1010 mmput(mm); 1011 + free_mem: 1012 + kfree(src_pfns); 1013 + kfree(dst_pfns); 1174 1014 return ret; 1175 1015 } 1176 1016 ··· 1366 1200 unsigned long i; 1367 1201 unsigned long *src_pfns; 1368 1202 unsigned long *dst_pfns; 1203 + unsigned int order = 0; 1369 1204 1370 1205 src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL); 1371 1206 dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL); ··· 1382 1215 if (WARN_ON(!is_device_private_page(spage) && 1383 1216 !is_device_coherent_page(spage))) 1384 1217 continue; 1218 + 1219 + order = folio_order(page_folio(spage)); 1385 1220 spage = BACKING_PAGE(spage); 1386 - dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); 1221 + if (src_pfns[i] & MIGRATE_PFN_COMPOUND) { 1222 + dpage = folio_page(folio_alloc(GFP_HIGHUSER_MOVABLE, 
1223 + order), 0); 1224 + } else { 1225 + dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL); 1226 + order = 0; 1227 + } 1228 + 1229 + /* TODO Support splitting here */ 1387 1230 lock_page(dpage); 1388 - copy_highpage(dpage, spage); 1389 1231 dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)); 1390 1232 if (src_pfns[i] & MIGRATE_PFN_WRITE) 1391 1233 dst_pfns[i] |= MIGRATE_PFN_WRITE; 1234 + if (order) 1235 + dst_pfns[i] |= MIGRATE_PFN_COMPOUND; 1236 + folio_copy(page_folio(dpage), page_folio(spage)); 1392 1237 } 1393 1238 migrate_device_pages(src_pfns, dst_pfns, npages); 1394 1239 migrate_device_finalize(src_pfns, dst_pfns, npages); ··· 1413 1234 { 1414 1235 struct dmirror_device *mdevice = devmem->mdevice; 1415 1236 struct page *page; 1237 + struct folio *folio; 1416 1238 1239 + 1240 + for (folio = mdevice->free_folios; folio; folio = folio_zone_device_data(folio)) 1241 + if (dmirror_page_to_chunk(folio_page(folio, 0)) == devmem) 1242 + mdevice->free_folios = folio_zone_device_data(folio); 1417 1243 for (page = mdevice->free_pages; page; page = page->zone_device_data) 1418 1244 if (dmirror_page_to_chunk(page) == devmem) 1419 1245 mdevice->free_pages = page->zone_device_data; ··· 1449 1265 mdevice->devmem_count = 0; 1450 1266 mdevice->devmem_capacity = 0; 1451 1267 mdevice->free_pages = NULL; 1268 + mdevice->free_folios = NULL; 1452 1269 kfree(mdevice->devmem_chunks); 1453 1270 mdevice->devmem_chunks = NULL; 1454 1271 } ··· 1564 1379 struct page *page = &folio->page; 1565 1380 struct page *rpage = BACKING_PAGE(page); 1566 1381 struct dmirror_device *mdevice; 1382 + struct folio *rfolio = page_folio(rpage); 1383 + unsigned int order = folio_order(rfolio); 1567 1384 1568 - if (rpage != page) 1569 - __free_page(rpage); 1385 + if (rpage != page) { 1386 + if (order) 1387 + __free_pages(rpage, order); 1388 + else 1389 + __free_page(rpage); 1390 + rpage = NULL; 1391 + } 1570 1392 1571 1393 mdevice = dmirror_page_to_device(page); 1572 1394 spin_lock(&mdevice->lock); 1573 
1395 1574 1396 /* Return page to our allocator if not freeing the chunk */ 1575 1397 if (!dmirror_page_to_chunk(page)->remove) { 1576 - mdevice->cfree++; 1577 - page->zone_device_data = mdevice->free_pages; 1578 - mdevice->free_pages = page; 1398 + mdevice->cfree += 1 << order; 1399 + if (order) { 1400 + page->zone_device_data = mdevice->free_folios; 1401 + mdevice->free_folios = page_folio(page); 1402 + } else { 1403 + page->zone_device_data = mdevice->free_pages; 1404 + mdevice->free_pages = page; 1405 + } 1579 1406 } 1580 1407 spin_unlock(&mdevice->lock); 1581 1408 } ··· 1595 1398 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) 1596 1399 { 1597 1400 struct migrate_vma args = { 0 }; 1598 - unsigned long src_pfns = 0; 1599 - unsigned long dst_pfns = 0; 1600 1401 struct page *rpage; 1601 1402 struct dmirror *dmirror; 1602 - vm_fault_t ret; 1403 + vm_fault_t ret = 0; 1404 + unsigned int order, nr; 1603 1405 1604 1406 /* 1605 1407 * Normally, a device would use the page->zone_device_data to point to 1606 1408 * the mirror but here we use it to hold the page for the simulated 1607 1409 * device memory and that page holds the pointer to the mirror. 1608 1410 */ 1609 - rpage = vmf->page->zone_device_data; 1411 + rpage = folio_zone_device_data(page_folio(vmf->page)); 1610 1412 dmirror = rpage->zone_device_data; 1611 1413 1612 1414 /* FIXME demonstrate how we can adjust migrate range */ 1415 + order = folio_order(page_folio(vmf->page)); 1416 + nr = 1 << order; 1417 + 1418 + /* 1419 + * Consider a per-cpu cache of src and dst pfns, but with 1420 + * large number of cpus that might not scale well. 
1421 + */ 1422 + args.start = ALIGN_DOWN(vmf->address, (PAGE_SIZE << order)); 1613 1423 args.vma = vmf->vma; 1614 - args.start = vmf->address; 1615 - args.end = args.start + PAGE_SIZE; 1616 - args.src = &src_pfns; 1617 - args.dst = &dst_pfns; 1424 + args.end = args.start + (PAGE_SIZE << order); 1425 + 1426 + nr = (args.end - args.start) >> PAGE_SHIFT; 1427 + args.src = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL); 1428 + args.dst = kcalloc(nr, sizeof(unsigned long), GFP_KERNEL); 1618 1429 args.pgmap_owner = dmirror->mdevice; 1619 1430 args.flags = dmirror_select_device(dmirror); 1620 1431 args.fault_page = vmf->page; 1432 + 1433 + if (!args.src || !args.dst) { 1434 + ret = VM_FAULT_OOM; 1435 + goto err; 1436 + } 1437 + 1438 + if (order) 1439 + args.flags |= MIGRATE_VMA_SELECT_COMPOUND; 1621 1440 1622 1441 if (migrate_vma_setup(&args)) 1623 1442 return VM_FAULT_SIGBUS; 1624 1443 1625 1444 ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); 1626 1445 if (ret) 1627 - return ret; 1446 + goto err; 1628 1447 migrate_vma_pages(&args); 1629 1448 /* 1630 1449 * No device finalize step is needed since ··· 1648 1435 * invalidated the device page table. 1649 1436 */ 1650 1437 migrate_vma_finalize(&args); 1651 - return 0; 1438 + err: 1439 + kfree(args.src); 1440 + kfree(args.dst); 1441 + return ret; 1652 1442 } 1653 1443 1654 1444 static const struct dev_pagemap_ops dmirror_devmem_ops = { ··· 1682 1466 return ret; 1683 1467 1684 1468 /* Build a list of free ZONE_DEVICE struct pages */ 1685 - return dmirror_allocate_chunk(mdevice, NULL); 1469 + return dmirror_allocate_chunk(mdevice, NULL, false); 1686 1470 } 1687 1471 1688 1472 static void dmirror_device_remove(struct dmirror_device *mdevice)