Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
"24 patches, based on 4a09d388f2ab382f217a764e6a152b3f614246f6.

Subsystems affected by this patch series: mm (thp, vmalloc, hugetlb,
memory-failure, and pagealloc), nilfs2, kthread, MAINTAINERS, and
mailmap"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (24 commits)
mailmap: add Marek's other e-mail address and identity without diacritics
MAINTAINERS: fix Marek's identity again
mm/page_alloc: do bulk array bounds check after checking populated elements
mm/page_alloc: __alloc_pages_bulk(): do bounds check before accessing array
mm/hwpoison: do not lock page again when me_huge_page() successfully recovers
mm,hwpoison: return -EHWPOISON to denote that the page has already been poisoned
mm/memory-failure: use a mutex to avoid memory_failure() races
mm, futex: fix shared futex pgoff on shmem huge page
kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
kthread_worker: split code for canceling the delayed work timer
mm/vmalloc: unbreak kasan vmalloc support
KVM: s390: prepare for hugepage vmalloc
mm/vmalloc: add vmalloc_no_huge
nilfs2: fix memory leak in nilfs_sysfs_delete_device_group
mm/thp: another PVMW_SYNC fix in page_vma_mapped_walk()
mm/thp: fix page_vma_mapped_walk() if THP mapped by ptes
mm: page_vma_mapped_walk(): get vma_address_end() earlier
mm: page_vma_mapped_walk(): use goto instead of while (1)
mm: page_vma_mapped_walk(): add a level of indentation
mm: page_vma_mapped_walk(): crossing page table boundary
...

+261 -163
+2
.mailmap
···
212 212 Manivannan Sadhasivam <mani@kernel.org> <manivannan.sadhasivam@linaro.org>
213 213 Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
214 214 Marc Zyngier <maz@kernel.org> <marc.zyngier@arm.com>
215 + Marek Behún <kabel@kernel.org> <marek.behun@nic.cz>
216 + Marek Behún <kabel@kernel.org> Marek Behun <marek.behun@nic.cz>
215 217 Mark Brown <broonie@sirena.org.uk>
216 218 Mark Starovoytov <mstarovo@pm.me> <mstarovoitov@marvell.com>
217 219 Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
+2 -2
MAINTAINERS
···
1816 1816 F: drivers/rtc/rtc-ftrtc010.c
1817 1817
1818 1818 ARM/CZ.NIC TURRIS SUPPORT
1819 - M: Marek Behun <kabel@kernel.org>
1819 + M: Marek Behún <kabel@kernel.org>
1820 1820 S: Maintained
1821 1821 W: https://www.turris.cz/
1822 1822 F: Documentation/ABI/testing/debugfs-moxtet
···
10945 10945
10946 10946 MARVELL MV88X3310 PHY DRIVER
10947 10947 M: Russell King <linux@armlinux.org.uk>
10948 - M: Marek Behun <marek.behun@nic.cz>
10948 + M: Marek Behún <kabel@kernel.org>
10949 10949 L: netdev@vger.kernel.org
10950 10950 S: Maintained
10951 10951 F: drivers/net/phy/marvell10g.c
+6 -1
arch/s390/kvm/pv.c
···
140 140 /* Allocate variable storage */
141 141 vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
142 142 vlen += uv_info.guest_virt_base_stor_len;
143 - kvm->arch.pv.stor_var = vzalloc(vlen);
143 + /*
144 + * The Create Secure Configuration Ultravisor Call does not support
145 + * using large pages for the virtual memory area.
146 + * This is a hardware limitation.
147 + */
148 + kvm->arch.pv.stor_var = vmalloc_no_huge(vlen);
144 149 if (!kvm->arch.pv.stor_var)
145 150 goto out_err;
146 151 return 0;
+1
fs/nilfs2/sysfs.c
···
1053 1053 nilfs_sysfs_delete_superblock_group(nilfs);
1054 1054 nilfs_sysfs_delete_segctor_group(nilfs);
1055 1055 kobject_del(&nilfs->ns_dev_kobj);
1056 + kobject_put(&nilfs->ns_dev_kobj);
1056 1057 kfree(nilfs->ns_dev_subgroups);
1057 1058 }
1058 1059
-16
include/linux/hugetlb.h
··· 741 741 return h - hstates; 742 742 } 743 743 744 - pgoff_t __basepage_index(struct page *page); 745 - 746 - /* Return page->index in PAGE_SIZE units */ 747 - static inline pgoff_t basepage_index(struct page *page) 748 - { 749 - if (!PageCompound(page)) 750 - return page->index; 751 - 752 - return __basepage_index(page); 753 - } 754 - 755 744 extern int dissolve_free_huge_page(struct page *page); 756 745 extern int dissolve_free_huge_pages(unsigned long start_pfn, 757 746 unsigned long end_pfn); ··· 975 986 static inline int hstate_index(struct hstate *h) 976 987 { 977 988 return 0; 978 - } 979 - 980 - static inline pgoff_t basepage_index(struct page *page) 981 - { 982 - return page->index; 983 989 } 984 990 985 991 static inline int dissolve_free_huge_page(struct page *page)
+7 -6
include/linux/pagemap.h
··· 516 516 } 517 517 518 518 /* 519 - * Get index of the page with in radix-tree 519 + * Get index of the page within radix-tree (but not for hugetlb pages). 520 520 * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) 521 521 */ 522 522 static inline pgoff_t page_to_index(struct page *page) ··· 535 535 return pgoff; 536 536 } 537 537 538 + extern pgoff_t hugetlb_basepage_index(struct page *page); 539 + 538 540 /* 539 - * Get the offset in PAGE_SIZE. 540 - * (TODO: hugepage should have ->index in PAGE_SIZE) 541 + * Get the offset in PAGE_SIZE (even for hugetlb pages). 542 + * (TODO: hugetlb pages should have ->index in PAGE_SIZE) 541 543 */ 542 544 static inline pgoff_t page_to_pgoff(struct page *page) 543 545 { 544 - if (unlikely(PageHeadHuge(page))) 545 - return page->index << compound_order(page); 546 - 546 + if (unlikely(PageHuge(page))) 547 + return hugetlb_basepage_index(page); 547 548 return page_to_index(page); 548 549 } 549 550
+1
include/linux/vmalloc.h
···
135 135 const void *caller);
136 136 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
137 137 int node, const void *caller);
138 + void *vmalloc_no_huge(unsigned long size);
138 139
139 140 extern void vfree(const void *addr);
140 141 extern void vfree_atomic(const void *addr);
+1 -2
kernel/futex.c
···
35 35 #include <linux/jhash.h>
36 36 #include <linux/pagemap.h>
37 37 #include <linux/syscalls.h>
38 - #include <linux/hugetlb.h>
39 38 #include <linux/freezer.h>
40 39 #include <linux/memblock.h>
41 40 #include <linux/fault-inject.h>
···
649 650
650 651 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
651 652 key->shared.i_seq = get_inode_sequence_number(inode);
652 - key->shared.pgoff = basepage_index(tail);
653 + key->shared.pgoff = page_to_pgoff(tail);
653 654 rcu_read_unlock();
654 655 }
655 656
+51 -26
kernel/kthread.c
··· 1093 1093 EXPORT_SYMBOL_GPL(kthread_flush_work); 1094 1094 1095 1095 /* 1096 - * This function removes the work from the worker queue. Also it makes sure 1097 - * that it won't get queued later via the delayed work's timer. 1096 + * Make sure that the timer is neither set nor running and could 1097 + * not manipulate the work list_head any longer. 1098 + * 1099 + * The function is called under worker->lock. The lock is temporary 1100 + * released but the timer can't be set again in the meantime. 1101 + */ 1102 + static void kthread_cancel_delayed_work_timer(struct kthread_work *work, 1103 + unsigned long *flags) 1104 + { 1105 + struct kthread_delayed_work *dwork = 1106 + container_of(work, struct kthread_delayed_work, work); 1107 + struct kthread_worker *worker = work->worker; 1108 + 1109 + /* 1110 + * del_timer_sync() must be called to make sure that the timer 1111 + * callback is not running. The lock must be temporary released 1112 + * to avoid a deadlock with the callback. In the meantime, 1113 + * any queuing is blocked by setting the canceling counter. 1114 + */ 1115 + work->canceling++; 1116 + raw_spin_unlock_irqrestore(&worker->lock, *flags); 1117 + del_timer_sync(&dwork->timer); 1118 + raw_spin_lock_irqsave(&worker->lock, *flags); 1119 + work->canceling--; 1120 + } 1121 + 1122 + /* 1123 + * This function removes the work from the worker queue. 1124 + * 1125 + * It is called under worker->lock. The caller must make sure that 1126 + * the timer used by delayed work is not running, e.g. by calling 1127 + * kthread_cancel_delayed_work_timer(). 1098 1128 * 1099 1129 * The work might still be in use when this function finishes. See the 1100 1130 * current_work proceed by the worker. 
··· 1132 1102 * Return: %true if @work was pending and successfully canceled, 1133 1103 * %false if @work was not pending 1134 1104 */ 1135 - static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, 1136 - unsigned long *flags) 1105 + static bool __kthread_cancel_work(struct kthread_work *work) 1137 1106 { 1138 - /* Try to cancel the timer if exists. */ 1139 - if (is_dwork) { 1140 - struct kthread_delayed_work *dwork = 1141 - container_of(work, struct kthread_delayed_work, work); 1142 - struct kthread_worker *worker = work->worker; 1143 - 1144 - /* 1145 - * del_timer_sync() must be called to make sure that the timer 1146 - * callback is not running. The lock must be temporary released 1147 - * to avoid a deadlock with the callback. In the meantime, 1148 - * any queuing is blocked by setting the canceling counter. 1149 - */ 1150 - work->canceling++; 1151 - raw_spin_unlock_irqrestore(&worker->lock, *flags); 1152 - del_timer_sync(&dwork->timer); 1153 - raw_spin_lock_irqsave(&worker->lock, *flags); 1154 - work->canceling--; 1155 - } 1156 - 1157 1107 /* 1158 1108 * Try to remove the work from a worker list. It might either 1159 1109 * be from worker->work_list or from worker->delayed_work_list. ··· 1186 1176 /* Work must not be used with >1 worker, see kthread_queue_work() */ 1187 1177 WARN_ON_ONCE(work->worker != worker); 1188 1178 1189 - /* Do not fight with another command that is canceling this work. */ 1179 + /* 1180 + * Temporary cancel the work but do not fight with another command 1181 + * that is canceling the work as well. 1182 + * 1183 + * It is a bit tricky because of possible races with another 1184 + * mod_delayed_work() and cancel_delayed_work() callers. 1185 + * 1186 + * The timer must be canceled first because worker->lock is released 1187 + * when doing so. But the work can be removed from the queue (list) 1188 + * only when it can be queued again so that the return value can 1189 + * be used for reference counting. 
1190 + */ 1191 + kthread_cancel_delayed_work_timer(work, &flags); 1190 1192 if (work->canceling) 1191 1193 goto out; 1194 + ret = __kthread_cancel_work(work); 1192 1195 1193 - ret = __kthread_cancel_work(work, true, &flags); 1194 1196 fast_queue: 1195 1197 __kthread_queue_delayed_work(worker, dwork, delay); 1196 1198 out: ··· 1224 1202 /* Work must not be used with >1 worker, see kthread_queue_work(). */ 1225 1203 WARN_ON_ONCE(work->worker != worker); 1226 1204 1227 - ret = __kthread_cancel_work(work, is_dwork, &flags); 1205 + if (is_dwork) 1206 + kthread_cancel_delayed_work_timer(work, &flags); 1207 + 1208 + ret = __kthread_cancel_work(work); 1228 1209 1229 1210 if (worker->current_work != work) 1230 1211 goto out_fast;
+1 -4
mm/hugetlb.c
···
1588 1588 return NULL;
1589 1589 }
1590 1590
1591 - pgoff_t __basepage_index(struct page *page)
1591 + pgoff_t hugetlb_basepage_index(struct page *page)
1592 1592 {
1593 1593 struct page *page_head = compound_head(page);
1594 1594 pgoff_t index = page_index(page_head);
1595 1595 unsigned long compound_idx;
1596 -
1597 - if (!PageHuge(page_head))
1598 - return page_index(page);
1599 1596
1600 1597 if (compound_order(page_head) >= MAX_ORDER)
1601 1598 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+55 -28
mm/memory-failure.c
··· 658 658 */ 659 659 static int me_kernel(struct page *p, unsigned long pfn) 660 660 { 661 + unlock_page(p); 661 662 return MF_IGNORED; 662 663 } 663 664 ··· 668 667 static int me_unknown(struct page *p, unsigned long pfn) 669 668 { 670 669 pr_err("Memory failure: %#lx: Unknown page state\n", pfn); 670 + unlock_page(p); 671 671 return MF_FAILED; 672 672 } 673 673 ··· 677 675 */ 678 676 static int me_pagecache_clean(struct page *p, unsigned long pfn) 679 677 { 678 + int ret; 680 679 struct address_space *mapping; 681 680 682 681 delete_from_lru_cache(p); ··· 686 683 * For anonymous pages we're done the only reference left 687 684 * should be the one m_f() holds. 688 685 */ 689 - if (PageAnon(p)) 690 - return MF_RECOVERED; 686 + if (PageAnon(p)) { 687 + ret = MF_RECOVERED; 688 + goto out; 689 + } 691 690 692 691 /* 693 692 * Now truncate the page in the page cache. This is really ··· 703 698 /* 704 699 * Page has been teared down in the meanwhile 705 700 */ 706 - return MF_FAILED; 701 + ret = MF_FAILED; 702 + goto out; 707 703 } 708 704 709 705 /* ··· 712 706 * 713 707 * Open: to take i_mutex or not for this? Right now we don't. 714 708 */ 715 - return truncate_error_page(p, pfn, mapping); 709 + ret = truncate_error_page(p, pfn, mapping); 710 + out: 711 + unlock_page(p); 712 + return ret; 716 713 } 717 714 718 715 /* ··· 791 782 */ 792 783 static int me_swapcache_dirty(struct page *p, unsigned long pfn) 793 784 { 785 + int ret; 786 + 794 787 ClearPageDirty(p); 795 788 /* Trigger EIO in shmem: */ 796 789 ClearPageUptodate(p); 797 790 798 - if (!delete_from_lru_cache(p)) 799 - return MF_DELAYED; 800 - else 801 - return MF_FAILED; 791 + ret = delete_from_lru_cache(p) ? 
MF_FAILED : MF_DELAYED; 792 + unlock_page(p); 793 + return ret; 802 794 } 803 795 804 796 static int me_swapcache_clean(struct page *p, unsigned long pfn) 805 797 { 798 + int ret; 799 + 806 800 delete_from_swap_cache(p); 807 801 808 - if (!delete_from_lru_cache(p)) 809 - return MF_RECOVERED; 810 - else 811 - return MF_FAILED; 802 + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; 803 + unlock_page(p); 804 + return ret; 812 805 } 813 806 814 807 /* ··· 831 820 mapping = page_mapping(hpage); 832 821 if (mapping) { 833 822 res = truncate_error_page(hpage, pfn, mapping); 823 + unlock_page(hpage); 834 824 } else { 835 825 res = MF_FAILED; 836 826 unlock_page(hpage); ··· 846 834 page_ref_inc(p); 847 835 res = MF_RECOVERED; 848 836 } 849 - lock_page(hpage); 850 837 } 851 838 852 839 return res; ··· 877 866 unsigned long mask; 878 867 unsigned long res; 879 868 enum mf_action_page_type type; 869 + 870 + /* Callback ->action() has to unlock the relevant page inside it. */ 880 871 int (*action)(struct page *p, unsigned long pfn); 881 872 } error_states[] = { 882 873 { reserved, reserved, MF_MSG_KERNEL, me_kernel }, ··· 942 929 int result; 943 930 int count; 944 931 932 + /* page p should be unlocked after returning from ps->action(). 
*/ 945 933 result = ps->action(p, pfn); 946 934 947 935 count = page_count(p) - 1; ··· 1267 1253 if (TestSetPageHWPoison(head)) { 1268 1254 pr_err("Memory failure: %#lx: already hardware poisoned\n", 1269 1255 pfn); 1270 - return 0; 1256 + return -EHWPOISON; 1271 1257 } 1272 1258 1273 1259 num_poisoned_pages_inc(); ··· 1327 1313 goto out; 1328 1314 } 1329 1315 1330 - res = identify_page_state(pfn, p, page_flags); 1316 + return identify_page_state(pfn, p, page_flags); 1331 1317 out: 1332 1318 unlock_page(head); 1333 1319 return res; ··· 1443 1429 struct page *hpage; 1444 1430 struct page *orig_head; 1445 1431 struct dev_pagemap *pgmap; 1446 - int res; 1432 + int res = 0; 1447 1433 unsigned long page_flags; 1448 1434 bool retry = true; 1435 + static DEFINE_MUTEX(mf_mutex); 1449 1436 1450 1437 if (!sysctl_memory_failure_recovery) 1451 1438 panic("Memory failure on page %lx", pfn); ··· 1464 1449 return -ENXIO; 1465 1450 } 1466 1451 1452 + mutex_lock(&mf_mutex); 1453 + 1467 1454 try_again: 1468 - if (PageHuge(p)) 1469 - return memory_failure_hugetlb(pfn, flags); 1455 + if (PageHuge(p)) { 1456 + res = memory_failure_hugetlb(pfn, flags); 1457 + goto unlock_mutex; 1458 + } 1459 + 1470 1460 if (TestSetPageHWPoison(p)) { 1471 1461 pr_err("Memory failure: %#lx: already hardware poisoned\n", 1472 1462 pfn); 1473 - return 0; 1463 + res = -EHWPOISON; 1464 + goto unlock_mutex; 1474 1465 } 1475 1466 1476 1467 orig_head = hpage = compound_head(p); ··· 1509 1488 res = MF_FAILED; 1510 1489 } 1511 1490 action_result(pfn, MF_MSG_BUDDY, res); 1512 - return res == MF_RECOVERED ? 0 : -EBUSY; 1491 + res = res == MF_RECOVERED ? 
0 : -EBUSY; 1513 1492 } else { 1514 1493 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); 1515 - return -EBUSY; 1494 + res = -EBUSY; 1516 1495 } 1496 + goto unlock_mutex; 1517 1497 } 1518 1498 1519 1499 if (PageTransHuge(hpage)) { 1520 1500 if (try_to_split_thp_page(p, "Memory Failure") < 0) { 1521 1501 action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); 1522 - return -EBUSY; 1502 + res = -EBUSY; 1503 + goto unlock_mutex; 1523 1504 } 1524 1505 VM_BUG_ON_PAGE(!page_count(p), p); 1525 1506 } ··· 1545 1522 if (PageCompound(p) && compound_head(p) != orig_head) { 1546 1523 action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); 1547 1524 res = -EBUSY; 1548 - goto out; 1525 + goto unlock_page; 1549 1526 } 1550 1527 1551 1528 /* ··· 1565 1542 num_poisoned_pages_dec(); 1566 1543 unlock_page(p); 1567 1544 put_page(p); 1568 - return 0; 1545 + goto unlock_mutex; 1569 1546 } 1570 1547 if (hwpoison_filter(p)) { 1571 1548 if (TestClearPageHWPoison(p)) 1572 1549 num_poisoned_pages_dec(); 1573 1550 unlock_page(p); 1574 1551 put_page(p); 1575 - return 0; 1552 + goto unlock_mutex; 1576 1553 } 1577 1554 1578 1555 /* ··· 1596 1573 if (!hwpoison_user_mappings(p, pfn, flags, &p)) { 1597 1574 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); 1598 1575 res = -EBUSY; 1599 - goto out; 1576 + goto unlock_page; 1600 1577 } 1601 1578 1602 1579 /* ··· 1605 1582 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1606 1583 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); 1607 1584 res = -EBUSY; 1608 - goto out; 1585 + goto unlock_page; 1609 1586 } 1610 1587 1611 1588 identify_page_state: 1612 1589 res = identify_page_state(pfn, p, page_flags); 1613 - out: 1590 + mutex_unlock(&mf_mutex); 1591 + return res; 1592 + unlock_page: 1614 1593 unlock_page(p); 1594 + unlock_mutex: 1595 + mutex_unlock(&mf_mutex); 1615 1596 return res; 1616 1597 } 1617 1598 EXPORT_SYMBOL_GPL(memory_failure);
+5 -1
mm/page_alloc.c
··· 5053 5053 * Skip populated array elements to determine if any pages need 5054 5054 * to be allocated before disabling IRQs. 5055 5055 */ 5056 - while (page_array && page_array[nr_populated] && nr_populated < nr_pages) 5056 + while (page_array && nr_populated < nr_pages && page_array[nr_populated]) 5057 5057 nr_populated++; 5058 + 5059 + /* Already populated array? */ 5060 + if (unlikely(page_array && nr_pages - nr_populated == 0)) 5061 + return 0; 5058 5062 5059 5063 /* Use the single page allocator for one page. */ 5060 5064 if (nr_pages - nr_populated == 1)
+98 -67
mm/page_vma_mapped.c
··· 116 116 return pfn_is_match(pvmw->page, pfn); 117 117 } 118 118 119 + static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) 120 + { 121 + pvmw->address = (pvmw->address + size) & ~(size - 1); 122 + if (!pvmw->address) 123 + pvmw->address = ULONG_MAX; 124 + } 125 + 119 126 /** 120 127 * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at 121 128 * @pvmw->address ··· 151 144 { 152 145 struct mm_struct *mm = pvmw->vma->vm_mm; 153 146 struct page *page = pvmw->page; 147 + unsigned long end; 154 148 pgd_t *pgd; 155 149 p4d_t *p4d; 156 150 pud_t *pud; ··· 161 153 if (pvmw->pmd && !pvmw->pte) 162 154 return not_found(pvmw); 163 155 164 - if (pvmw->pte) 165 - goto next_pte; 156 + if (unlikely(PageHuge(page))) { 157 + /* The only possible mapping was handled on last iteration */ 158 + if (pvmw->pte) 159 + return not_found(pvmw); 166 160 167 - if (unlikely(PageHuge(pvmw->page))) { 168 161 /* when pud is not present, pte will be NULL */ 169 162 pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); 170 163 if (!pvmw->pte) ··· 177 168 return not_found(pvmw); 178 169 return true; 179 170 } 180 - restart: 181 - pgd = pgd_offset(mm, pvmw->address); 182 - if (!pgd_present(*pgd)) 183 - return false; 184 - p4d = p4d_offset(pgd, pvmw->address); 185 - if (!p4d_present(*p4d)) 186 - return false; 187 - pud = pud_offset(p4d, pvmw->address); 188 - if (!pud_present(*pud)) 189 - return false; 190 - pvmw->pmd = pmd_offset(pud, pvmw->address); 191 - /* 192 - * Make sure the pmd value isn't cached in a register by the 193 - * compiler and used as a stale value after we've observed a 194 - * subsequent update. 
195 - */ 196 - pmde = READ_ONCE(*pvmw->pmd); 197 - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { 198 - pvmw->ptl = pmd_lock(mm, pvmw->pmd); 199 - if (likely(pmd_trans_huge(*pvmw->pmd))) { 200 - if (pvmw->flags & PVMW_MIGRATION) 201 - return not_found(pvmw); 202 - if (pmd_page(*pvmw->pmd) != page) 203 - return not_found(pvmw); 204 - return true; 205 - } else if (!pmd_present(*pvmw->pmd)) { 206 - if (thp_migration_supported()) { 207 - if (!(pvmw->flags & PVMW_MIGRATION)) 208 - return not_found(pvmw); 209 - if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { 210 - swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); 211 171 212 - if (migration_entry_to_page(entry) != page) 213 - return not_found(pvmw); 214 - return true; 215 - } 172 + /* 173 + * Seek to next pte only makes sense for THP. 174 + * But more important than that optimization, is to filter out 175 + * any PageKsm page: whose page->index misleads vma_address() 176 + * and vma_address_end() to disaster. 177 + */ 178 + end = PageTransCompound(page) ? 179 + vma_address_end(page, pvmw->vma) : 180 + pvmw->address + PAGE_SIZE; 181 + if (pvmw->pte) 182 + goto next_pte; 183 + restart: 184 + do { 185 + pgd = pgd_offset(mm, pvmw->address); 186 + if (!pgd_present(*pgd)) { 187 + step_forward(pvmw, PGDIR_SIZE); 188 + continue; 189 + } 190 + p4d = p4d_offset(pgd, pvmw->address); 191 + if (!p4d_present(*p4d)) { 192 + step_forward(pvmw, P4D_SIZE); 193 + continue; 194 + } 195 + pud = pud_offset(p4d, pvmw->address); 196 + if (!pud_present(*pud)) { 197 + step_forward(pvmw, PUD_SIZE); 198 + continue; 199 + } 200 + 201 + pvmw->pmd = pmd_offset(pud, pvmw->address); 202 + /* 203 + * Make sure the pmd value isn't cached in a register by the 204 + * compiler and used as a stale value after we've observed a 205 + * subsequent update. 
206 + */ 207 + pmde = READ_ONCE(*pvmw->pmd); 208 + 209 + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { 210 + pvmw->ptl = pmd_lock(mm, pvmw->pmd); 211 + pmde = *pvmw->pmd; 212 + if (likely(pmd_trans_huge(pmde))) { 213 + if (pvmw->flags & PVMW_MIGRATION) 214 + return not_found(pvmw); 215 + if (pmd_page(pmde) != page) 216 + return not_found(pvmw); 217 + return true; 216 218 } 217 - return not_found(pvmw); 218 - } else { 219 + if (!pmd_present(pmde)) { 220 + swp_entry_t entry; 221 + 222 + if (!thp_migration_supported() || 223 + !(pvmw->flags & PVMW_MIGRATION)) 224 + return not_found(pvmw); 225 + entry = pmd_to_swp_entry(pmde); 226 + if (!is_migration_entry(entry) || 227 + migration_entry_to_page(entry) != page) 228 + return not_found(pvmw); 229 + return true; 230 + } 219 231 /* THP pmd was split under us: handle on pte level */ 220 232 spin_unlock(pvmw->ptl); 221 233 pvmw->ptl = NULL; 222 - } 223 - } else if (!pmd_present(pmde)) { 224 - /* 225 - * If PVMW_SYNC, take and drop THP pmd lock so that we 226 - * cannot return prematurely, while zap_huge_pmd() has 227 - * cleared *pmd but not decremented compound_mapcount(). 228 - */ 229 - if ((pvmw->flags & PVMW_SYNC) && 230 - PageTransCompound(pvmw->page)) { 231 - spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); 234 + } else if (!pmd_present(pmde)) { 235 + /* 236 + * If PVMW_SYNC, take and drop THP pmd lock so that we 237 + * cannot return prematurely, while zap_huge_pmd() has 238 + * cleared *pmd but not decremented compound_mapcount(). 
239 + */ 240 + if ((pvmw->flags & PVMW_SYNC) && 241 + PageTransCompound(page)) { 242 + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); 232 243 233 - spin_unlock(ptl); 244 + spin_unlock(ptl); 245 + } 246 + step_forward(pvmw, PMD_SIZE); 247 + continue; 234 248 } 235 - return false; 236 - } 237 - if (!map_pte(pvmw)) 238 - goto next_pte; 239 - while (1) { 240 - unsigned long end; 241 - 249 + if (!map_pte(pvmw)) 250 + goto next_pte; 251 + this_pte: 242 252 if (check_pte(pvmw)) 243 253 return true; 244 254 next_pte: 245 - /* Seek to next pte only makes sense for THP */ 246 - if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) 247 - return not_found(pvmw); 248 - end = vma_address_end(pvmw->page, pvmw->vma); 249 255 do { 250 256 pvmw->address += PAGE_SIZE; 251 257 if (pvmw->address >= end) 252 258 return not_found(pvmw); 253 259 /* Did we cross page table boundary? */ 254 - if (pvmw->address % PMD_SIZE == 0) { 255 - pte_unmap(pvmw->pte); 260 + if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { 256 261 if (pvmw->ptl) { 257 262 spin_unlock(pvmw->ptl); 258 263 pvmw->ptl = NULL; 259 264 } 265 + pte_unmap(pvmw->pte); 266 + pvmw->pte = NULL; 260 267 goto restart; 261 - } else { 262 - pvmw->pte++; 268 + } 269 + pvmw->pte++; 270 + if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { 271 + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); 272 + spin_lock(pvmw->ptl); 263 273 } 264 274 } while (pte_none(*pvmw->pte)); 265 275 ··· 286 258 pvmw->ptl = pte_lockptr(mm, pvmw->pmd); 287 259 spin_lock(pvmw->ptl); 288 260 } 289 - } 261 + goto this_pte; 262 + } while (pvmw->address < end); 263 + 264 + return false; 290 265 } 291 266 292 267 /**
+31 -10
mm/vmalloc.c
··· 2344 2344 } 2345 2345 2346 2346 static struct vm_struct *__get_vm_area_node(unsigned long size, 2347 - unsigned long align, unsigned long flags, unsigned long start, 2348 - unsigned long end, int node, gfp_t gfp_mask, const void *caller) 2347 + unsigned long align, unsigned long shift, unsigned long flags, 2348 + unsigned long start, unsigned long end, int node, 2349 + gfp_t gfp_mask, const void *caller) 2349 2350 { 2350 2351 struct vmap_area *va; 2351 2352 struct vm_struct *area; 2352 2353 unsigned long requested_size = size; 2353 2354 2354 2355 BUG_ON(in_interrupt()); 2355 - size = PAGE_ALIGN(size); 2356 + size = ALIGN(size, 1ul << shift); 2356 2357 if (unlikely(!size)) 2357 2358 return NULL; 2358 2359 ··· 2385 2384 unsigned long start, unsigned long end, 2386 2385 const void *caller) 2387 2386 { 2388 - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, 2389 - GFP_KERNEL, caller); 2387 + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, 2388 + NUMA_NO_NODE, GFP_KERNEL, caller); 2390 2389 } 2391 2390 2392 2391 /** ··· 2402 2401 */ 2403 2402 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2404 2403 { 2405 - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2404 + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2405 + VMALLOC_START, VMALLOC_END, 2406 2406 NUMA_NO_NODE, GFP_KERNEL, 2407 2407 __builtin_return_address(0)); 2408 2408 } ··· 2411 2409 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2412 2410 const void *caller) 2413 2411 { 2414 - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 2412 + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2413 + VMALLOC_START, VMALLOC_END, 2415 2414 NUMA_NO_NODE, GFP_KERNEL, caller); 2416 2415 } 2417 2416 ··· 2905 2902 } 2906 2903 2907 2904 again: 2908 - size = PAGE_ALIGN(size); 2909 - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | 2910 - vm_flags, start, end, node, 
gfp_mask, caller); 2905 + area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | 2906 + VM_UNINITIALIZED | vm_flags, start, end, node, 2907 + gfp_mask, caller); 2911 2908 if (!area) { 2912 2909 warn_alloc(gfp_mask, NULL, 2913 2910 "vmalloc size %lu allocation failure: " ··· 2926 2923 */ 2927 2924 clear_vm_uninitialized_flag(area); 2928 2925 2926 + size = PAGE_ALIGN(size); 2929 2927 kmemleak_vmalloc(area, size, gfp_mask); 2930 2928 2931 2929 return addr; ··· 3001 2997 __builtin_return_address(0)); 3002 2998 } 3003 2999 EXPORT_SYMBOL(vmalloc); 3000 + 3001 + /** 3002 + * vmalloc_no_huge - allocate virtually contiguous memory using small pages 3003 + * @size: allocation size 3004 + * 3005 + * Allocate enough non-huge pages to cover @size from the page level 3006 + * allocator and map them into contiguous kernel virtual space. 3007 + * 3008 + * Return: pointer to the allocated memory or %NULL on error 3009 + */ 3010 + void *vmalloc_no_huge(unsigned long size) 3011 + { 3012 + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, 3013 + GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP, 3014 + NUMA_NO_NODE, __builtin_return_address(0)); 3015 + } 3016 + EXPORT_SYMBOL(vmalloc_no_huge); 3004 3017 3005 3018 /** 3006 3019 * vzalloc - allocate virtually contiguous memory with zero fill