Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'powerpc-4.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux

Pull powerpc fixes from Michael Ellerman:

- fix for mm_dec_nr_pmds() from Scott.

- fixes for oopses seen with KVM + THP from Aneesh.

- build fixes from Aneesh & Shreyas.

* tag 'powerpc-4.1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mpe/linux:
powerpc/mm: Fix build error with CONFIG_PPC_TRANSACTIONAL_MEM disabled
powerpc/kvm: Fix ppc64_defconfig + PPC_POWERNV=n build error
powerpc/mm/thp: Return pte address if we find trans_splitting.
powerpc/mm/thp: Make page table walk safe against thp split/collapse
KVM: PPC: Remove page table walk helpers
KVM: PPC: Use READ_ONCE when dereferencing pte_t pointer
powerpc/hugetlb: Call mm_dec_nr_pmds() in hugetlb_free_pmd_range()

+137 -117
+6 -11
arch/powerpc/include/asm/kvm_book3s_64.h
··· 295 295 296 296 /* 297 297 * If it's present and writable, atomically set dirty and referenced bits and 298 - * return the PTE, otherwise return 0. If we find a transparent hugepage 299 - * and if it is marked splitting we return 0; 298 + * return the PTE, otherwise return 0. 300 299 */ 301 - static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing, 302 - unsigned int hugepage) 300 + static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) 303 301 { 304 302 pte_t old_pte, new_pte = __pte(0); 305 303 306 304 while (1) { 307 - old_pte = *ptep; 305 + /* 306 + * Make sure we don't reload from ptep 307 + */ 308 + old_pte = READ_ONCE(*ptep); 308 309 /* 309 310 * wait until _PAGE_BUSY is clear then set it atomically 310 311 */ ··· 313 312 cpu_relax(); 314 313 continue; 315 314 } 316 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 317 - /* If hugepage and is trans splitting return None */ 318 - if (unlikely(hugepage && 319 - pmd_trans_splitting(pte_pmd(old_pte)))) 320 - return __pte(0); 321 - #endif 322 315 /* If pte is not present return None */ 323 316 if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT))) 324 317 return __pte(0);
+8 -20
arch/powerpc/include/asm/pgtable.h
··· 247 247 #define pmd_large(pmd) 0 248 248 #define has_transparent_hugepage() 0 249 249 #endif 250 - pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 250 + pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 251 251 unsigned *shift); 252 - 253 - static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva, 254 - unsigned long *pte_sizep) 252 + static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 253 + unsigned *shift) 255 254 { 256 - pte_t *ptep; 257 - unsigned long ps = *pte_sizep; 258 - unsigned int shift; 259 - 260 - ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); 261 - if (!ptep) 262 - return NULL; 263 - if (shift) 264 - *pte_sizep = 1ul << shift; 265 - else 266 - *pte_sizep = PAGE_SIZE; 267 - 268 - if (ps > *pte_sizep) 269 - return NULL; 270 - 271 - return ptep; 255 + if (!arch_irqs_disabled()) { 256 + pr_info("%s called with irq enabled\n", __func__); 257 + dump_stack(); 258 + } 259 + return __find_linux_pte_or_hugepte(pgdir, ea, shift); 272 260 } 273 261 #endif /* __ASSEMBLY__ */ 274 262
+4 -2
arch/powerpc/kernel/eeh.c
··· 334 334 int hugepage_shift; 335 335 336 336 /* 337 - * We won't find hugepages here, iomem 337 + * We won't find hugepages here(this is iomem). Hence we are not 338 + * worried about _PAGE_SPLITTING/collapse. Also we will not hit 339 + * page table free, because of init_mm. 338 340 */ 339 - ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); 341 + ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift); 340 342 if (!ptep) 341 343 return token; 342 344 WARN_ON(hugepage_shift);
+5 -5
arch/powerpc/kernel/io-workarounds.c
··· 71 71 vaddr = (unsigned long)PCI_FIX_ADDR(addr); 72 72 if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) 73 73 return NULL; 74 - 75 - ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr, 74 + /* 75 + * We won't find huge pages here (iomem). Also can't hit 76 + * a page table free due to init_mm 77 + */ 78 + ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr, 76 79 &hugepage_shift); 77 80 if (ptep == NULL) 78 81 paddr = 0; 79 82 else { 80 - /* 81 - * we don't have hugepages backing iomem 82 - */ 83 83 WARN_ON(hugepage_shift); 84 84 paddr = pte_pfn(*ptep) << PAGE_SHIFT; 85 85 }
+1 -1
arch/powerpc/kvm/Kconfig
··· 75 75 76 76 config KVM_BOOK3S_64_HV 77 77 tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" 78 - depends on KVM_BOOK3S_64 78 + depends on KVM_BOOK3S_64 && PPC_POWERNV 79 79 select KVM_BOOK3S_HV_POSSIBLE 80 80 select MMU_NOTIFIER 81 81 select CMA
+6 -8
arch/powerpc/kvm/book3s_64_mmu_hv.c
··· 535 535 } 536 536 /* if the guest wants write access, see if that is OK */ 537 537 if (!writing && hpte_is_writable(r)) { 538 - unsigned int hugepage_shift; 539 538 pte_t *ptep, pte; 540 - 539 + unsigned long flags; 541 540 /* 542 541 * We need to protect against page table destruction 543 - * while looking up and updating the pte. 542 + * hugepage split and collapse. 544 543 */ 545 - rcu_read_lock_sched(); 544 + local_irq_save(flags); 546 545 ptep = find_linux_pte_or_hugepte(current->mm->pgd, 547 - hva, &hugepage_shift); 546 + hva, NULL); 548 547 if (ptep) { 549 - pte = kvmppc_read_update_linux_pte(ptep, 1, 550 - hugepage_shift); 548 + pte = kvmppc_read_update_linux_pte(ptep, 1); 551 549 if (pte_write(pte)) 552 550 write_ok = 1; 553 551 } 554 - rcu_read_unlock_sched(); 552 + local_irq_restore(flags); 555 553 } 556 554 } 557 555
+47 -39
arch/powerpc/kvm/book3s_hv_rm_mmu.c
··· 26 26 { 27 27 unsigned long addr = (unsigned long) x; 28 28 pte_t *p; 29 - 30 - p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); 29 + /* 30 + * assume we don't have huge pages in vmalloc space... 31 + * So don't worry about THP collapse/split. Called 32 + * Only in realmode, hence won't need irq_save/restore. 33 + */ 34 + p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); 31 35 if (!p || !pte_present(*p)) 32 36 return NULL; 33 - /* assume we don't have huge pages in vmalloc space... */ 34 37 addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); 35 38 return __va(addr); 36 39 } ··· 134 131 unlock_rmap(rmap); 135 132 } 136 133 137 - static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva, 138 - int writing, unsigned long *pte_sizep) 139 - { 140 - pte_t *ptep; 141 - unsigned long ps = *pte_sizep; 142 - unsigned int hugepage_shift; 143 - 144 - ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift); 145 - if (!ptep) 146 - return __pte(0); 147 - if (hugepage_shift) 148 - *pte_sizep = 1ul << hugepage_shift; 149 - else 150 - *pte_sizep = PAGE_SIZE; 151 - if (ps > *pte_sizep) 152 - return __pte(0); 153 - return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift); 154 - } 155 - 156 134 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, 157 135 long pte_index, unsigned long pteh, unsigned long ptel, 158 136 pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) ··· 144 160 struct revmap_entry *rev; 145 161 unsigned long g_ptel; 146 162 struct kvm_memory_slot *memslot; 147 - unsigned long pte_size; 163 + unsigned hpage_shift; 148 164 unsigned long is_io; 149 165 unsigned long *rmap; 150 - pte_t pte; 166 + pte_t *ptep; 151 167 unsigned int writing; 152 168 unsigned long mmu_seq; 153 - unsigned long rcbits; 169 + unsigned long rcbits, irq_flags = 0; 154 170 155 171 psize = hpte_page_size(pteh, ptel); 156 172 if (!psize) ··· 186 202 187 203 /* Translate to host virtual address */ 188 204 hva = __gfn_to_hva_memslot(memslot, gfn); 189 - 190 - /* Look up the Linux PTE for the backing page */ 191 - pte_size = psize; 192 - pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size); 193 - if (pte_present(pte) && !pte_protnone(pte)) { 194 - if (writing && !pte_write(pte)) 195 - /* make the actual HPTE be read-only */ 196 - ptel = hpte_make_readonly(ptel); 197 - is_io = hpte_cache_bits(pte_val(pte)); 198 - pa = pte_pfn(pte) << PAGE_SHIFT; 199 - pa |= hva & (pte_size - 1); 200 - pa |= gpa & ~PAGE_MASK; 205 + /* 206 + * If we had a page table table change after lookup, we would 207 + * retry via mmu_notifier_retry. 208 + */ 209 + if (realmode) 210 + ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); 211 + else { 212 + local_irq_save(irq_flags); 213 + ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); 201 214 } 215 + if (ptep) { 216 + pte_t pte; 217 + unsigned int host_pte_size; 202 218 203 - if (pte_size < psize) 204 - return H_PARAMETER; 219 + if (hpage_shift) 220 + host_pte_size = 1ul << hpage_shift; 221 + else 222 + host_pte_size = PAGE_SIZE; 223 + /* 224 + * We should always find the guest page size 225 + * to <= host page size, if host is using hugepage 226 + */ 227 + if (host_pte_size < psize) { 228 + if (!realmode) 229 + local_irq_restore(flags); 230 + return H_PARAMETER; 231 + } 232 + pte = kvmppc_read_update_linux_pte(ptep, writing); 233 + if (pte_present(pte) && !pte_protnone(pte)) { 234 + if (writing && !pte_write(pte)) 235 + /* make the actual HPTE be read-only */ 236 + ptel = hpte_make_readonly(ptel); 237 + is_io = hpte_cache_bits(pte_val(pte)); 238 + pa = pte_pfn(pte) << PAGE_SHIFT; 239 + pa |= hva & (host_pte_size - 1); 240 + pa |= gpa & ~PAGE_MASK; 241 + } 242 + } 243 + if (!realmode) 244 + local_irq_restore(irq_flags); 205 245 206 246 ptel &= ~(HPTE_R_PP0 - psize); 207 247 ptel |= pa;
+23 -9
arch/powerpc/kvm/e500_mmu_host.c
··· 338 338 pte_t *ptep; 339 339 unsigned int wimg = 0; 340 340 pgd_t *pgdir; 341 + unsigned long flags; 341 342 342 343 /* used to check for invalidations in progress */ 343 344 mmu_seq = kvm->mmu_notifier_seq; ··· 469 468 470 469 471 470 pgdir = vcpu_e500->vcpu.arch.pgdir; 472 - ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages); 473 - if (pte_present(*ptep)) 474 - wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; 475 - else { 476 - if (printk_ratelimit()) 477 - pr_err("%s: pte not present: gfn %lx, pfn %lx\n", 478 - __func__, (long)gfn, pfn); 479 - ret = -EINVAL; 480 - goto out; 471 + /* 472 + * We are just looking at the wimg bits, so we don't 473 + * care much about the trans splitting bit. 474 + * We are holding kvm->mmu_lock so a notifier invalidate 475 + * can't run hence pfn won't change. 476 + */ 477 + local_irq_save(flags); 478 + ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL); 479 + if (ptep) { 480 + pte_t pte = READ_ONCE(*ptep); 481 + 482 + if (pte_present(pte)) { 483 + wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) & 484 + MAS2_WIMGE_MASK; 485 + local_irq_restore(flags); 486 + } else { 487 + local_irq_restore(flags); 488 + pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n", 489 + __func__, (long)gfn, pfn); 490 + ret = -EINVAL; 491 + goto out; 492 + } 481 493 } 482 494 kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); 483 495
+2 -1
arch/powerpc/mm/hash_utils_64.c
··· 1066 1066 #endif /* CONFIG_PPC_64K_PAGES */ 1067 1067 1068 1068 /* Get PTE and page size from page tables */ 1069 - ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); 1069 + ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); 1070 1070 if (ptep == NULL || !pte_present(*ptep)) { 1071 1071 DBG_LOW(" no PTE !\n"); 1072 1072 rc = 1; ··· 1394 1394 tm_abort(TM_CAUSE_TLBI); 1395 1395 } 1396 1396 #endif 1397 + return; 1397 1398 } 1398 1399 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1399 1400
+21 -11
arch/powerpc/mm/hugetlbpage.c
··· 109 109 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 110 110 { 111 111 /* Only called for hugetlbfs pages, hence can ignore THP */ 112 - return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 112 + return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 113 113 } 114 114 115 115 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, ··· 581 581 pmd = pmd_offset(pud, start); 582 582 pud_clear(pud); 583 583 pmd_free_tlb(tlb, pmd, start); 584 + mm_dec_nr_pmds(tlb->mm); 584 585 } 585 586 586 587 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, ··· 682 681 } while (addr = next, addr != end); 683 682 } 684 683 684 + /* 685 + * We are holding mmap_sem, so a parallel huge page collapse cannot run. 686 + * To prevent hugepage split, disable irq. 687 + */ 685 688 struct page * 686 689 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 687 690 { 688 691 pte_t *ptep; 689 692 struct page *page; 690 693 unsigned shift; 691 - unsigned long mask; 694 + unsigned long mask, flags; 692 695 /* 693 696 * Transparent hugepages are handled by generic code. We can skip them 694 697 * here. 695 698 */ 699 + local_irq_save(flags); 696 700 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); 697 701 698 702 /* Verify it is a huge page else bail. */ 699 - if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) 703 + if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) { 704 + local_irq_restore(flags); 700 705 return ERR_PTR(-EINVAL); 701 - 706 + } 702 707 mask = (1UL << shift) - 1; 703 708 page = pte_page(*ptep); 704 709 if (page) 705 710 page += (address & mask) / PAGE_SIZE; 706 711 712 + local_irq_restore(flags); 707 713 return page; 708 714 } 709 715 ··· 957 949 * 958 950 * So long as we atomically load page table pointers we are safe against teardown, 959 951 * we can follow the address down to the the page and take a ref on it. 952 + * This function need to be called with interrupts disabled. We use this variant 953 + * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 960 954 */ 961 955 962 - pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) 956 + pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 957 + unsigned *shift) 963 958 { 964 959 pgd_t pgd, *pgdp; 965 960 pud_t pud, *pudp; ··· 1014 1003 * A hugepage collapse is captured by pmd_none, because 1015 1004 * it mark the pmd none and do a hpte invalidate. 1016 1005 * 1017 - * A hugepage split is captured by pmd_trans_splitting 1018 - * because we mark the pmd trans splitting and do a 1019 - * hpte invalidate 1020 - * 1006 + * We don't worry about pmd_trans_splitting here, The 1007 + * caller if it needs to handle the splitting case 1008 + * should check for that. 1021 1009 */ 1022 - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) 1010 + if (pmd_none(pmd)) 1023 1011 return NULL; 1024 1012 1025 1013 if (pmd_huge(pmd) || pmd_large(pmd)) { ··· 1040 1030 *shift = pdshift; 1041 1031 return ret_pte; 1042 1032 } 1043 - EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); 1033 + EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); 1044 1034 1045 1035 int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 1046 1036 unsigned long end, int write, struct page **pages, int *nr)
+14 -10
arch/powerpc/perf/callchain.c
··· 111 111 * interrupt context, so if the access faults, we read the page tables 112 112 * to find which page (if any) is mapped and access it directly. 113 113 */ 114 - static int read_user_stack_slow(void __user *ptr, void *ret, int nb) 114 + static int read_user_stack_slow(void __user *ptr, void *buf, int nb) 115 115 { 116 + int ret = -EFAULT; 116 117 pgd_t *pgdir; 117 118 pte_t *ptep, pte; 118 119 unsigned shift; 119 120 unsigned long addr = (unsigned long) ptr; 120 121 unsigned long offset; 121 - unsigned long pfn; 122 + unsigned long pfn, flags; 122 123 void *kaddr; 123 124 124 125 pgdir = current->mm->pgd; 125 126 if (!pgdir) 126 127 return -EFAULT; 127 128 129 + local_irq_save(flags); 128 130 ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift); 131 + if (!ptep) 132 + goto err_out; 129 133 if (!shift) 130 134 shift = PAGE_SHIFT; 131 135 132 136 /* align address to page boundary */ 133 137 offset = addr & ((1UL << shift) - 1); 134 - addr -= offset; 135 138 136 - if (ptep == NULL) 137 - return -EFAULT; 138 - pte = *ptep; 139 + pte = READ_ONCE(*ptep); 139 140 if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) 140 - return -EFAULT; 141 + goto err_out; 141 142 pfn = pte_pfn(pte); 142 143 if (!page_is_ram(pfn)) 143 - return -EFAULT; 144 + goto err_out; 144 145 145 146 /* no highmem to worry about here */ 146 147 kaddr = pfn_to_kaddr(pfn); 147 - memcpy(ret, kaddr + offset, nb); 148 - return 0; 148 + memcpy(buf, kaddr + offset, nb); 149 + ret = 0; 150 + err_out: 151 + local_irq_restore(flags); 152 + return ret; 149 153 } 150 154 151 155 static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)