Merge branch 'akpm' (patches from Andrew)

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
"24 patches, based on 4a09d388f2ab382f217a764e6a152b3f614246f6.

Subsystems affected by this patch series: mm (thp, vmalloc, hugetlb,
memory-failure, and pagealloc), nilfs2, kthread, MAINTAINERS, and
mailmap"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (24 commits)
mailmap: add Marek's other e-mail address and identity without diacritics
MAINTAINERS: fix Marek's identity again
mm/page_alloc: do bulk array bounds check after checking populated elements
mm/page_alloc: __alloc_pages_bulk(): do bounds check before accessing array
mm/hwpoison: do not lock page again when me_huge_page() successfully recovers
mm,hwpoison: return -EHWPOISON to denote that the page has already been poisoned
mm/memory-failure: use a mutex to avoid memory_failure() races
mm, futex: fix shared futex pgoff on shmem huge page
kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
kthread_worker: split code for canceling the delayed work timer
mm/vmalloc: unbreak kasan vmalloc support
KVM: s390: prepare for hugepage vmalloc
mm/vmalloc: add vmalloc_no_huge
nilfs2: fix memory leak in nilfs_sysfs_delete_device_group
mm/thp: another PVMW_SYNC fix in page_vma_mapped_walk()
mm/thp: fix page_vma_mapped_walk() if THP mapped by ptes
mm: page_vma_mapped_walk(): get vma_address_end() earlier
mm: page_vma_mapped_walk(): use goto instead of while (1)
mm: page_vma_mapped_walk(): add a level of indentation
mm: page_vma_mapped_walk(): crossing page table boundary
...

Linus Torvalds 5 years ago 7ce32ac6 808e9df4

+261 -163

14 changed files

expand all collapse all

.mailmap

MAINTAINERS

arch

s390

kvm

pv.c

nilfs2

sysfs.c

include

linux

hugetlb.h

pagemap.h

vmalloc.h

kernel

futex.c

kthread.c

hugetlb.c

memory-failure.c

page_alloc.c

page_vma_mapped.c

vmalloc.c

.mailmap

reviewed

··· 212 212 Manivannan Sadhasivam <mani@kernel.org> <manivannan.sadhasivam@linaro.org> 213 213 Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com> 214 214 Marc Zyngier <maz@kernel.org> <marc.zyngier@arm.com> 215 215 + Marek Behún <kabel@kernel.org> <marek.behun@nic.cz> 216 216 + Marek Behún <kabel@kernel.org> Marek Behun <marek.behun@nic.cz> 215 217 Mark Brown <broonie@sirena.org.uk> 216 218 Mark Starovoytov <mstarovo@pm.me> <mstarovoitov@marvell.com> 217 219 Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>

+2 -2

MAINTAINERS

reviewed

··· 1816 1816 F: drivers/rtc/rtc-ftrtc010.c 1817 1817 1818 1818 ARM/CZ.NIC TURRIS SUPPORT 1819 1819 - M: Marek Behun <kabel@kernel.org> 1819 1819 + M: Marek Behún <kabel@kernel.org> 1820 1820 S: Maintained 1821 1821 W: https://www.turris.cz/ 1822 1822 F: Documentation/ABI/testing/debugfs-moxtet ··· 10945 10945 10946 10946 MARVELL MV88X3310 PHY DRIVER 10947 10947 M: Russell King <linux@armlinux.org.uk> 10948 10948 - M: Marek Behun <marek.behun@nic.cz> 10948 10948 + M: Marek Behún <kabel@kernel.org> 10949 10949 L: netdev@vger.kernel.org 10950 10950 S: Maintained 10951 10951 F: drivers/net/phy/marvell10g.c

+6 -1

arch/s390/kvm/pv.c

reviewed

··· 140 140 /* Allocate variable storage */ 141 141 vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); 142 142 vlen += uv_info.guest_virt_base_stor_len; 143 143 - kvm->arch.pv.stor_var = vzalloc(vlen); 143 143 + /* 144 144 + * The Create Secure Configuration Ultravisor Call does not support 145 145 + * using large pages for the virtual memory area. 146 146 + * This is a hardware limitation. 147 147 + */ 148 148 + kvm->arch.pv.stor_var = vmalloc_no_huge(vlen); 144 149 if (!kvm->arch.pv.stor_var) 145 150 goto out_err; 146 151 return 0;

fs/nilfs2/sysfs.c

reviewed

··· 1053 1053 nilfs_sysfs_delete_superblock_group(nilfs); 1054 1054 nilfs_sysfs_delete_segctor_group(nilfs); 1055 1055 kobject_del(&nilfs->ns_dev_kobj); 1056 1056 + kobject_put(&nilfs->ns_dev_kobj); 1056 1057 kfree(nilfs->ns_dev_subgroups); 1057 1058 } 1058 1059

-16

include/linux/hugetlb.h

reviewed

··· 741 741 return h - hstates; 742 742 } 743 743 744 744 - pgoff_t __basepage_index(struct page *page); 745 745 - 746 746 - /* Return page->index in PAGE_SIZE units */ 747 747 - static inline pgoff_t basepage_index(struct page *page) 748 748 - { 749 749 - if (!PageCompound(page)) 750 750 - return page->index; 751 751 - 752 752 - return __basepage_index(page); 753 753 - } 754 754 - 755 744 extern int dissolve_free_huge_page(struct page *page); 756 745 extern int dissolve_free_huge_pages(unsigned long start_pfn, 757 746 unsigned long end_pfn); ··· 975 986 static inline int hstate_index(struct hstate *h) 976 987 { 977 988 return 0; 978 978 - } 979 979 - 980 980 - static inline pgoff_t basepage_index(struct page *page) 981 981 - { 982 982 - return page->index; 983 989 } 984 990 985 991 static inline int dissolve_free_huge_page(struct page *page)

+7 -6

include/linux/pagemap.h

reviewed

··· 516 516 } 517 517 518 518 /* 519 519 - * Get index of the page with in radix-tree 519 519 + * Get index of the page within radix-tree (but not for hugetlb pages). 520 520 * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) 521 521 */ 522 522 static inline pgoff_t page_to_index(struct page *page) ··· 535 535 return pgoff; 536 536 } 537 537 538 538 + extern pgoff_t hugetlb_basepage_index(struct page *page); 539 539 + 538 540 /* 539 539 - * Get the offset in PAGE_SIZE. 540 540 - * (TODO: hugepage should have ->index in PAGE_SIZE) 541 541 + * Get the offset in PAGE_SIZE (even for hugetlb pages). 542 542 + * (TODO: hugetlb pages should have ->index in PAGE_SIZE) 541 543 */ 542 544 static inline pgoff_t page_to_pgoff(struct page *page) 543 545 { 544 544 - if (unlikely(PageHeadHuge(page))) 545 545 - return page->index << compound_order(page); 546 546 - 546 546 + if (unlikely(PageHuge(page))) 547 547 + return hugetlb_basepage_index(page); 547 548 return page_to_index(page); 548 549 } 549 550

include/linux/vmalloc.h

reviewed

··· 135 135 const void *caller); 136 136 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, 137 137 int node, const void *caller); 138 138 + void *vmalloc_no_huge(unsigned long size); 138 139 139 140 extern void vfree(const void *addr); 140 141 extern void vfree_atomic(const void *addr);

+1 -2

kernel/futex.c

reviewed

··· 35 35 #include <linux/jhash.h> 36 36 #include <linux/pagemap.h> 37 37 #include <linux/syscalls.h> 38 38 - #include <linux/hugetlb.h> 39 38 #include <linux/freezer.h> 40 39 #include <linux/memblock.h> 41 40 #include <linux/fault-inject.h> ··· 649 650 650 651 key->both.offset |= FUT_OFF_INODE; /* inode-based key */ 651 652 key->shared.i_seq = get_inode_sequence_number(inode); 652 652 - key->shared.pgoff = basepage_index(tail); 653 653 + key->shared.pgoff = page_to_pgoff(tail); 653 654 rcu_read_unlock(); 654 655 } 655 656

+51 -26

kernel/kthread.c

reviewed

··· 1093 1093 EXPORT_SYMBOL_GPL(kthread_flush_work); 1094 1094 1095 1095 /* 1096 1096 - * This function removes the work from the worker queue. Also it makes sure 1097 1097 - * that it won't get queued later via the delayed work's timer. 1096 1096 + * Make sure that the timer is neither set nor running and could 1097 1097 + * not manipulate the work list_head any longer. 1098 1098 + * 1099 1099 + * The function is called under worker->lock. The lock is temporary 1100 1100 + * released but the timer can't be set again in the meantime. 1101 1101 + */ 1102 1102 + static void kthread_cancel_delayed_work_timer(struct kthread_work *work, 1103 1103 + unsigned long *flags) 1104 1104 + { 1105 1105 + struct kthread_delayed_work *dwork = 1106 1106 + container_of(work, struct kthread_delayed_work, work); 1107 1107 + struct kthread_worker *worker = work->worker; 1108 1108 + 1109 1109 + /* 1110 1110 + * del_timer_sync() must be called to make sure that the timer 1111 1111 + * callback is not running. The lock must be temporary released 1112 1112 + * to avoid a deadlock with the callback. In the meantime, 1113 1113 + * any queuing is blocked by setting the canceling counter. 1114 1114 + */ 1115 1115 + work->canceling++; 1116 1116 + raw_spin_unlock_irqrestore(&worker->lock, *flags); 1117 1117 + del_timer_sync(&dwork->timer); 1118 1118 + raw_spin_lock_irqsave(&worker->lock, *flags); 1119 1119 + work->canceling--; 1120 1120 + } 1121 1121 + 1122 1122 + /* 1123 1123 + * This function removes the work from the worker queue. 1124 1124 + * 1125 1125 + * It is called under worker->lock. The caller must make sure that 1126 1126 + * the timer used by delayed work is not running, e.g. by calling 1127 1127 + * kthread_cancel_delayed_work_timer(). 1098 1128 * 1099 1129 * The work might still be in use when this function finishes. See the 1100 1130 * current_work proceed by the worker. ··· 1132 1102 * Return: %true if @work was pending and successfully canceled, 1133 1103 * %false if @work was not pending 1134 1104 */ 1135 1135 - static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, 1136 1136 - unsigned long *flags) 1105 1105 + static bool __kthread_cancel_work(struct kthread_work *work) 1137 1106 { 1138 1138 - /* Try to cancel the timer if exists. */ 1139 1139 - if (is_dwork) { 1140 1140 - struct kthread_delayed_work *dwork = 1141 1141 - container_of(work, struct kthread_delayed_work, work); 1142 1142 - struct kthread_worker *worker = work->worker; 1143 1143 - 1144 1144 - /* 1145 1145 - * del_timer_sync() must be called to make sure that the timer 1146 1146 - * callback is not running. The lock must be temporary released 1147 1147 - * to avoid a deadlock with the callback. In the meantime, 1148 1148 - * any queuing is blocked by setting the canceling counter. 1149 1149 - */ 1150 1150 - work->canceling++; 1151 1151 - raw_spin_unlock_irqrestore(&worker->lock, *flags); 1152 1152 - del_timer_sync(&dwork->timer); 1153 1153 - raw_spin_lock_irqsave(&worker->lock, *flags); 1154 1154 - work->canceling--; 1155 1155 - } 1156 1156 - 1157 1107 /* 1158 1108 * Try to remove the work from a worker list. It might either 1159 1109 * be from worker->work_list or from worker->delayed_work_list. ··· 1186 1176 /* Work must not be used with >1 worker, see kthread_queue_work() */ 1187 1177 WARN_ON_ONCE(work->worker != worker); 1188 1178 1189 1189 - /* Do not fight with another command that is canceling this work. */ 1179 1179 + /* 1180 1180 + * Temporary cancel the work but do not fight with another command 1181 1181 + * that is canceling the work as well. 1182 1182 + * 1183 1183 + * It is a bit tricky because of possible races with another 1184 1184 + * mod_delayed_work() and cancel_delayed_work() callers. 1185 1185 + * 1186 1186 + * The timer must be canceled first because worker->lock is released 1187 1187 + * when doing so. But the work can be removed from the queue (list) 1188 1188 + * only when it can be queued again so that the return value can 1189 1189 + * be used for reference counting. 1190 1190 + */ 1191 1191 + kthread_cancel_delayed_work_timer(work, &flags); 1190 1192 if (work->canceling) 1191 1193 goto out; 1194 1194 + ret = __kthread_cancel_work(work); 1192 1195 1193 1193 - ret = __kthread_cancel_work(work, true, &flags); 1194 1196 fast_queue: 1195 1197 __kthread_queue_delayed_work(worker, dwork, delay); 1196 1198 out: ··· 1224 1202 /* Work must not be used with >1 worker, see kthread_queue_work(). */ 1225 1203 WARN_ON_ONCE(work->worker != worker); 1226 1204 1227 1227 - ret = __kthread_cancel_work(work, is_dwork, &flags); 1205 1205 + if (is_dwork) 1206 1206 + kthread_cancel_delayed_work_timer(work, &flags); 1207 1207 + 1208 1208 + ret = __kthread_cancel_work(work); 1228 1209 1229 1210 if (worker->current_work != work) 1230 1211 goto out_fast;

+1 -4

mm/hugetlb.c

reviewed

··· 1588 1588 return NULL; 1589 1589 } 1590 1590 1591 1591 - pgoff_t __basepage_index(struct page *page) 1591 1591 + pgoff_t hugetlb_basepage_index(struct page *page) 1592 1592 { 1593 1593 struct page *page_head = compound_head(page); 1594 1594 pgoff_t index = page_index(page_head); 1595 1595 unsigned long compound_idx; 1596 1596 - 1597 1597 - if (!PageHuge(page_head)) 1598 1598 - return page_index(page); 1599 1596 1600 1597 if (compound_order(page_head) >= MAX_ORDER) 1601 1598 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);

+55 -28

mm/memory-failure.c

reviewed

··· 658 658 */ 659 659 static int me_kernel(struct page *p, unsigned long pfn) 660 660 { 661 661 + unlock_page(p); 661 662 return MF_IGNORED; 662 663 } 663 664 ··· 668 667 static int me_unknown(struct page *p, unsigned long pfn) 669 668 { 670 669 pr_err("Memory failure: %#lx: Unknown page state\n", pfn); 670 670 + unlock_page(p); 671 671 return MF_FAILED; 672 672 } 673 673 ··· 677 675 */ 678 676 static int me_pagecache_clean(struct page *p, unsigned long pfn) 679 677 { 678 678 + int ret; 680 679 struct address_space *mapping; 681 680 682 681 delete_from_lru_cache(p); ··· 686 683 * For anonymous pages we're done the only reference left 687 684 * should be the one m_f() holds. 688 685 */ 689 689 - if (PageAnon(p)) 690 690 - return MF_RECOVERED; 686 686 + if (PageAnon(p)) { 687 687 + ret = MF_RECOVERED; 688 688 + goto out; 689 689 + } 691 690 692 691 /* 693 692 * Now truncate the page in the page cache. This is really ··· 703 698 /* 704 699 * Page has been teared down in the meanwhile 705 700 */ 706 706 - return MF_FAILED; 701 701 + ret = MF_FAILED; 702 702 + goto out; 707 703 } 708 704 709 705 /* ··· 712 706 * 713 707 * Open: to take i_mutex or not for this? Right now we don't. 714 708 */ 715 715 - return truncate_error_page(p, pfn, mapping); 709 709 + ret = truncate_error_page(p, pfn, mapping); 710 710 + out: 711 711 + unlock_page(p); 712 712 + return ret; 716 713 } 717 714 718 715 /* ··· 791 782 */ 792 783 static int me_swapcache_dirty(struct page *p, unsigned long pfn) 793 784 { 785 785 + int ret; 786 786 + 794 787 ClearPageDirty(p); 795 788 /* Trigger EIO in shmem: */ 796 789 ClearPageUptodate(p); 797 790 798 798 - if (!delete_from_lru_cache(p)) 799 799 - return MF_DELAYED; 800 800 - else 801 801 - return MF_FAILED; 791 791 + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; 792 792 + unlock_page(p); 793 793 + return ret; 802 794 } 803 795 804 796 static int me_swapcache_clean(struct page *p, unsigned long pfn) 805 797 { 798 798 + int ret; 799 799 + 806 800 delete_from_swap_cache(p); 807 801 808 808 - if (!delete_from_lru_cache(p)) 809 809 - return MF_RECOVERED; 810 810 - else 811 811 - return MF_FAILED; 802 802 + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; 803 803 + unlock_page(p); 804 804 + return ret; 812 805 } 813 806 814 807 /* ··· 831 820 mapping = page_mapping(hpage); 832 821 if (mapping) { 833 822 res = truncate_error_page(hpage, pfn, mapping); 823 823 + unlock_page(hpage); 834 824 } else { 835 825 res = MF_FAILED; 836 826 unlock_page(hpage); ··· 846 834 page_ref_inc(p); 847 835 res = MF_RECOVERED; 848 836 } 849 849 - lock_page(hpage); 850 837 } 851 838 852 839 return res; ··· 877 866 unsigned long mask; 878 867 unsigned long res; 879 868 enum mf_action_page_type type; 869 869 + 870 870 + /* Callback ->action() has to unlock the relevant page inside it. */ 880 871 int (*action)(struct page *p, unsigned long pfn); 881 872 } error_states[] = { 882 873 { reserved, reserved, MF_MSG_KERNEL, me_kernel }, ··· 942 929 int result; 943 930 int count; 944 931 932 932 + /* page p should be unlocked after returning from ps->action(). */ 945 933 result = ps->action(p, pfn); 946 934 947 935 count = page_count(p) - 1; ··· 1267 1253 if (TestSetPageHWPoison(head)) { 1268 1254 pr_err("Memory failure: %#lx: already hardware poisoned\n", 1269 1255 pfn); 1270 1270 - return 0; 1256 1256 + return -EHWPOISON; 1271 1257 } 1272 1258 1273 1259 num_poisoned_pages_inc(); ··· 1327 1313 goto out; 1328 1314 } 1329 1315 1330 1330 - res = identify_page_state(pfn, p, page_flags); 1316 1316 + return identify_page_state(pfn, p, page_flags); 1331 1317 out: 1332 1318 unlock_page(head); 1333 1319 return res; ··· 1443 1429 struct page *hpage; 1444 1430 struct page *orig_head; 1445 1431 struct dev_pagemap *pgmap; 1446 1446 - int res; 1432 1432 + int res = 0; 1447 1433 unsigned long page_flags; 1448 1434 bool retry = true; 1435 1435 + static DEFINE_MUTEX(mf_mutex); 1449 1436 1450 1437 if (!sysctl_memory_failure_recovery) 1451 1438 panic("Memory failure on page %lx", pfn); ··· 1464 1449 return -ENXIO; 1465 1450 } 1466 1451 1452 1452 + mutex_lock(&mf_mutex); 1453 1453 + 1467 1454 try_again: 1468 1468 - if (PageHuge(p)) 1469 1469 - return memory_failure_hugetlb(pfn, flags); 1455 1455 + if (PageHuge(p)) { 1456 1456 + res = memory_failure_hugetlb(pfn, flags); 1457 1457 + goto unlock_mutex; 1458 1458 + } 1459 1459 + 1470 1460 if (TestSetPageHWPoison(p)) { 1471 1461 pr_err("Memory failure: %#lx: already hardware poisoned\n", 1472 1462 pfn); 1473 1473 - return 0; 1463 1463 + res = -EHWPOISON; 1464 1464 + goto unlock_mutex; 1474 1465 } 1475 1466 1476 1467 orig_head = hpage = compound_head(p); ··· 1509 1488 res = MF_FAILED; 1510 1489 } 1511 1490 action_result(pfn, MF_MSG_BUDDY, res); 1512 1512 - return res == MF_RECOVERED ? 0 : -EBUSY; 1491 1491 + res = res == MF_RECOVERED ? 0 : -EBUSY; 1513 1492 } else { 1514 1493 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); 1515 1515 - return -EBUSY; 1494 1494 + res = -EBUSY; 1516 1495 } 1496 1496 + goto unlock_mutex; 1517 1497 } 1518 1498 1519 1499 if (PageTransHuge(hpage)) { 1520 1500 if (try_to_split_thp_page(p, "Memory Failure") < 0) { 1521 1501 action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); 1522 1522 - return -EBUSY; 1502 1502 + res = -EBUSY; 1503 1503 + goto unlock_mutex; 1523 1504 } 1524 1505 VM_BUG_ON_PAGE(!page_count(p), p); 1525 1506 } ··· 1545 1522 if (PageCompound(p) && compound_head(p) != orig_head) { 1546 1523 action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); 1547 1524 res = -EBUSY; 1548 1548 - goto out; 1525 1525 + goto unlock_page; 1549 1526 } 1550 1527 1551 1528 /* ··· 1565 1542 num_poisoned_pages_dec(); 1566 1543 unlock_page(p); 1567 1544 put_page(p); 1568 1568 - return 0; 1545 1545 + goto unlock_mutex; 1569 1546 } 1570 1547 if (hwpoison_filter(p)) { 1571 1548 if (TestClearPageHWPoison(p)) 1572 1549 num_poisoned_pages_dec(); 1573 1550 unlock_page(p); 1574 1551 put_page(p); 1575 1575 - return 0; 1552 1552 + goto unlock_mutex; 1576 1553 } 1577 1554 1578 1555 /* ··· 1596 1573 if (!hwpoison_user_mappings(p, pfn, flags, &p)) { 1597 1574 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); 1598 1575 res = -EBUSY; 1599 1599 - goto out; 1576 1576 + goto unlock_page; 1600 1577 } 1601 1578 1602 1579 /* ··· 1605 1582 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1606 1583 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); 1607 1584 res = -EBUSY; 1608 1608 - goto out; 1585 1585 + goto unlock_page; 1609 1586 } 1610 1587 1611 1588 identify_page_state: 1612 1589 res = identify_page_state(pfn, p, page_flags); 1613 1613 - out: 1590 1590 + mutex_unlock(&mf_mutex); 1591 1591 + return res; 1592 1592 + unlock_page: 1614 1593 unlock_page(p); 1594 1594 + unlock_mutex: 1595 1595 + mutex_unlock(&mf_mutex); 1615 1596 return res; 1616 1597 } 1617 1598 EXPORT_SYMBOL_GPL(memory_failure);

+5 -1

mm/page_alloc.c

reviewed

··· 5053 5053 * Skip populated array elements to determine if any pages need 5054 5054 * to be allocated before disabling IRQs. 5055 5055 */ 5056 5056 - while (page_array && page_array[nr_populated] && nr_populated < nr_pages) 5056 5056 + while (page_array && nr_populated < nr_pages && page_array[nr_populated]) 5057 5057 nr_populated++; 5058 5058 + 5059 5059 + /* Already populated array? */ 5060 5060 + if (unlikely(page_array && nr_pages - nr_populated == 0)) 5061 5061 + return 0; 5058 5062 5059 5063 /* Use the single page allocator for one page. */ 5060 5064 if (nr_pages - nr_populated == 1)

+98 -67

mm/page_vma_mapped.c

reviewed

··· 116 116 return pfn_is_match(pvmw->page, pfn); 117 117 } 118 118 119 119 + static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) 120 120 + { 121 121 + pvmw->address = (pvmw->address + size) & ~(size - 1); 122 122 + if (!pvmw->address) 123 123 + pvmw->address = ULONG_MAX; 124 124 + } 125 125 + 119 126 /** 120 127 * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at 121 128 * @pvmw->address ··· 151 144 { 152 145 struct mm_struct *mm = pvmw->vma->vm_mm; 153 146 struct page *page = pvmw->page; 147 147 + unsigned long end; 154 148 pgd_t *pgd; 155 149 p4d_t *p4d; 156 150 pud_t *pud; ··· 161 153 if (pvmw->pmd && !pvmw->pte) 162 154 return not_found(pvmw); 163 155 164 164 - if (pvmw->pte) 165 165 - goto next_pte; 156 156 + if (unlikely(PageHuge(page))) { 157 157 + /* The only possible mapping was handled on last iteration */ 158 158 + if (pvmw->pte) 159 159 + return not_found(pvmw); 166 160 167 167 - if (unlikely(PageHuge(pvmw->page))) { 168 161 /* when pud is not present, pte will be NULL */ 169 162 pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); 170 163 if (!pvmw->pte) ··· 177 168 return not_found(pvmw); 178 169 return true; 179 170 } 180 180 - restart: 181 181 - pgd = pgd_offset(mm, pvmw->address); 182 182 - if (!pgd_present(*pgd)) 183 183 - return false; 184 184 - p4d = p4d_offset(pgd, pvmw->address); 185 185 - if (!p4d_present(*p4d)) 186 186 - return false; 187 187 - pud = pud_offset(p4d, pvmw->address); 188 188 - if (!pud_present(*pud)) 189 189 - return false; 190 190 - pvmw->pmd = pmd_offset(pud, pvmw->address); 191 191 - /* 192 192 - * Make sure the pmd value isn't cached in a register by the 193 193 - * compiler and used as a stale value after we've observed a 194 194 - * subsequent update. 195 195 - */ 196 196 - pmde = READ_ONCE(*pvmw->pmd); 197 197 - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { 198 198 - pvmw->ptl = pmd_lock(mm, pvmw->pmd); 199 199 - if (likely(pmd_trans_huge(*pvmw->pmd))) { 200 200 - if (pvmw->flags & PVMW_MIGRATION) 201 201 - return not_found(pvmw); 202 202 - if (pmd_page(*pvmw->pmd) != page) 203 203 - return not_found(pvmw); 204 204 - return true; 205 205 - } else if (!pmd_present(*pvmw->pmd)) { 206 206 - if (thp_migration_supported()) { 207 207 - if (!(pvmw->flags & PVMW_MIGRATION)) 208 208 - return not_found(pvmw); 209 209 - if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { 210 210 - swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); 211 171 212 212 - if (migration_entry_to_page(entry) != page) 213 213 - return not_found(pvmw); 214 214 - return true; 215 215 - } 172 172 + /* 173 173 + * Seek to next pte only makes sense for THP. 174 174 + * But more important than that optimization, is to filter out 175 175 + * any PageKsm page: whose page->index misleads vma_address() 176 176 + * and vma_address_end() to disaster. 177 177 + */ 178 178 + end = PageTransCompound(page) ? 179 179 + vma_address_end(page, pvmw->vma) : 180 180 + pvmw->address + PAGE_SIZE; 181 181 + if (pvmw->pte) 182 182 + goto next_pte; 183 183 + restart: 184 184 + do { 185 185 + pgd = pgd_offset(mm, pvmw->address); 186 186 + if (!pgd_present(*pgd)) { 187 187 + step_forward(pvmw, PGDIR_SIZE); 188 188 + continue; 189 189 + } 190 190 + p4d = p4d_offset(pgd, pvmw->address); 191 191 + if (!p4d_present(*p4d)) { 192 192 + step_forward(pvmw, P4D_SIZE); 193 193 + continue; 194 194 + } 195 195 + pud = pud_offset(p4d, pvmw->address); 196 196 + if (!pud_present(*pud)) { 197 197 + step_forward(pvmw, PUD_SIZE); 198 198 + continue; 199 199 + } 200 200 + 201 201 + pvmw->pmd = pmd_offset(pud, pvmw->address); 202 202 + /* 203 203 + * Make sure the pmd value isn't cached in a register by the 204 204 + * compiler and used as a stale value after we've observed a 205 205 + * subsequent update. 206 206 + */ 207 207 + pmde = READ_ONCE(*pvmw->pmd); 208 208 + 209 209 + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { 210 210 + pvmw->ptl = pmd_lock(mm, pvmw->pmd); 211 211 + pmde = *pvmw->pmd; 212 212 + if (likely(pmd_trans_huge(pmde))) { 213 213 + if (pvmw->flags & PVMW_MIGRATION) 214 214 + return not_found(pvmw); 215 215 + if (pmd_page(pmde) != page) 216 216 + return not_found(pvmw); 217 217 + return true; 216 218 } 217 217 - return not_found(pvmw); 218 218 - } else { 219 219 + if (!pmd_present(pmde)) { 220 220 + swp_entry_t entry; 221 221 + 222 222 + if (!thp_migration_supported() || 223 223 + !(pvmw->flags & PVMW_MIGRATION)) 224 224 + return not_found(pvmw); 225 225 + entry = pmd_to_swp_entry(pmde); 226 226 + if (!is_migration_entry(entry) || 227 227 + migration_entry_to_page(entry) != page) 228 228 + return not_found(pvmw); 229 229 + return true; 230 230 + } 219 231 /* THP pmd was split under us: handle on pte level */ 220 232 spin_unlock(pvmw->ptl); 221 233 pvmw->ptl = NULL; 222 222 - } 223 223 - } else if (!pmd_present(pmde)) { 224 224 - /* 225 225 - * If PVMW_SYNC, take and drop THP pmd lock so that we 226 226 - * cannot return prematurely, while zap_huge_pmd() has 227 227 - * cleared *pmd but not decremented compound_mapcount(). 228 228 - */ 229 229 - if ((pvmw->flags & PVMW_SYNC) && 230 230 - PageTransCompound(pvmw->page)) { 231 231 - spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); 234 234 + } else if (!pmd_present(pmde)) { 235 235 + /* 236 236 + * If PVMW_SYNC, take and drop THP pmd lock so that we 237 237 + * cannot return prematurely, while zap_huge_pmd() has 238 238 + * cleared *pmd but not decremented compound_mapcount(). 239 239 + */ 240 240 + if ((pvmw->flags & PVMW_SYNC) && 241 241 + PageTransCompound(page)) { 242 242 + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); 232 243 233 233 - spin_unlock(ptl); 244 244 + spin_unlock(ptl); 245 245 + } 246 246 + step_forward(pvmw, PMD_SIZE); 247 247 + continue; 234 248 } 235 235 - return false; 236 236 - } 237 237 - if (!map_pte(pvmw)) 238 238 - goto next_pte; 239 239 - while (1) { 240 240 - unsigned long end; 241 241 - 249 249 + if (!map_pte(pvmw)) 250 250 + goto next_pte; 251 251 + this_pte: 242 252 if (check_pte(pvmw)) 243 253 return true; 244 254 next_pte: 245 245 - /* Seek to next pte only makes sense for THP */ 246 246 - if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) 247 247 - return not_found(pvmw); 248 248 - end = vma_address_end(pvmw->page, pvmw->vma); 249 255 do { 250 256 pvmw->address += PAGE_SIZE; 251 257 if (pvmw->address >= end) 252 258 return not_found(pvmw); 253 259 /* Did we cross page table boundary? */ 254 254 - if (pvmw->address % PMD_SIZE == 0) { 255 255 - pte_unmap(pvmw->pte); 260 260 + if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { 256 261 if (pvmw->ptl) { 257 262 spin_unlock(pvmw->ptl); 258 263 pvmw->ptl = NULL; 259 264 } 265 265 + pte_unmap(pvmw->pte); 266 266 + pvmw->pte = NULL; 260 267 goto restart; 261 261 - } else { 262 262 - pvmw->pte++; 268 268 + } 269 269 + pvmw->pte++; 270 270 + if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { 271 271 + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); 272 272 + spin_lock(pvmw->ptl); 263 273 } 264 274 } while (pte_none(*pvmw->pte)); 265 275 ··· 286 258 pvmw->ptl = pte_lockptr(mm, pvmw->pmd); 287 259 spin_lock(pvmw->ptl); 288 260 } 289 289 - } 261 261 + goto this_pte; 262 262 + } while (pvmw->address < end); 263 263 + 264 264 + return false; 290 265 } 291 266 292 267 /**

+31 -10

mm/vmalloc.c

reviewed

··· 2344 2344 } 2345 2345 2346 2346 static struct vm_struct *__get_vm_area_node(unsigned long size, 2347 2347 - unsigned long align, unsigned long flags, unsigned long start, 2348 2348 - unsigned long end, int node, gfp_t gfp_mask, const void *caller) 2347 2347 + unsigned long align, unsigned long shift, unsigned long flags, 2348 2348 + unsigned long start, unsigned long end, int node, 2349 2349 + gfp_t gfp_mask, const void *caller) 2349 2350 { 2350 2351 struct vmap_area *va; 2351 2352 struct vm_struct *area; 2352 2353 unsigned long requested_size = size; 2353 2354 2354 2355 BUG_ON(in_interrupt()); 2355 2355 - size = PAGE_ALIGN(size); 2356 2356 + size = ALIGN(size, 1ul << shift); 2356 2357 if (unlikely(!size)) 2357 2358 return NULL; 2358 2359 ··· 2385 2384 unsigned long start, unsigned long end, 2386 2385 const void *caller) 2387 2386 { 2388 2388 - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, 2389 2389 - GFP_KERNEL, caller); 2387 2387 + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, 2388 2388 + NUMA_NO_NODE, GFP_KERNEL, caller); 2390 2389 } 2391 2390 2392 2391 /** ··· 2402 2401 */ 2403 2402 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2404 2403 { 2405 2405 - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2404 2404 + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2405 2405 + VMALLOC_START, VMALLOC_END, 2406 2406 NUMA_NO_NODE, GFP_KERNEL, 2407 2407 __builtin_return_address(0)); 2408 2408 } ··· 2411 2409 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2412 2410 const void *caller) 2413 2411 { 2414 2414 - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2412 2412 + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2413 2413 + VMALLOC_START, VMALLOC_END, 2415 2414 NUMA_NO_NODE, GFP_KERNEL, caller); 2416 2415 } 2417 2416 ··· 2905 2902 } 2906 2903 2907 2904 again: 2908 2908 - size = PAGE_ALIGN(size); 2909 2909 - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | 2910 2910 - vm_flags, start, end, node, gfp_mask, caller); 2905 2905 + area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | 2906 2906 + VM_UNINITIALIZED | vm_flags, start, end, node, 2907 2907 + gfp_mask, caller); 2911 2908 if (!area) { 2912 2909 warn_alloc(gfp_mask, NULL, 2913 2910 "vmalloc size %lu allocation failure: " ··· 2926 2923 */ 2927 2924 clear_vm_uninitialized_flag(area); 2928 2925 2926 2926 + size = PAGE_ALIGN(size); 2929 2927 kmemleak_vmalloc(area, size, gfp_mask); 2930 2928 2931 2929 return addr; ··· 3001 2997 __builtin_return_address(0)); 3002 2998 } 3003 2999 EXPORT_SYMBOL(vmalloc); 3000 3000 + 3001 3001 + /** 3002 3002 + * vmalloc_no_huge - allocate virtually contiguous memory using small pages 3003 3003 + * @size: allocation size 3004 3004 + * 3005 3005 + * Allocate enough non-huge pages to cover @size from the page level 3006 3006 + * allocator and map them into contiguous kernel virtual space. 3007 3007 + * 3008 3008 + * Return: pointer to the allocated memory or %NULL on error 3009 3009 + */ 3010 3010 + void *vmalloc_no_huge(unsigned long size) 3011 3011 + { 3012 3012 + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, 3013 3013 + GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP, 3014 3014 + NUMA_NO_NODE, __builtin_return_address(0)); 3015 3015 + } 3016 3016 + EXPORT_SYMBOL(vmalloc_no_huge); 3004 3017 3005 3018 /** 3006 3019 * vzalloc - allocate virtually contiguous memory with zero fill