Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch kvm-arm64/user_mem_abort-rework into kvmarm-master/next

* kvm-arm64/user_mem_abort-rework: (30 commits)
: .
: user_mem_abort() has become an absolute pain to maintain,
: to the point that each single fix is likely to introduce
: *two* new bugs.
:
: Deconstruct the whole thing in logical units, reducing
: the amount of visible and/or mutable state between functions,
: and finally making the code a bit more maintainable.
: .
KVM: arm64: Convert gmem_abort() to struct kvm_s2_fault_desc
KVM: arm64: Simplify integration of adjust_nested_*_perms()
KVM: arm64: Directly expose mapping prot and kill kvm_s2_fault
KVM: arm64: Move device mapping management into kvm_s2_fault_pin_pfn()
KVM: arm64: Replace force_pte with a max_map_size attribute
KVM: arm64: Move kvm_s2_fault.{pfn,page} to kvm_s2_vma_info
KVM: arm64: Restrict the scope of the 'writable' attribute
KVM: arm64: Kill logging_active from kvm_s2_fault
KVM: arm64: Move VMA-related information to kvm_s2_fault_vma_info
KVM: arm64: Kill topup_memcache from kvm_s2_fault
KVM: arm64: Kill exec_fault from kvm_s2_fault
KVM: arm64: Kill write_fault from kvm_s2_fault
KVM: arm64: Constrain fault_granule to kvm_s2_fault_map()
KVM: arm64: Replace fault_is_perm with a helper
KVM: arm64: Move fault context to const structure
KVM: arm64: Make fault_ipa immutable
KVM: arm64: Kill fault->ipa
KVM: arm64: Clean up control flow in kvm_s2_fault_map()
KVM: arm64: Hoist MTE validation check out of MMU lock path
KVM: arm64: Optimize early exit checks in kvm_s2_fault_pin_pfn()
...

Signed-off-by: Marc Zyngier <maz@kernel.org>

+310 -220
+310 -220
arch/arm64/kvm/mmu.c
··· 1400 1400 */ 1401 1401 static long 1402 1402 transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, 1403 - unsigned long hva, kvm_pfn_t *pfnp, 1404 - phys_addr_t *ipap) 1403 + unsigned long hva, kvm_pfn_t *pfnp, gfn_t *gfnp) 1405 1404 { 1406 1405 kvm_pfn_t pfn = *pfnp; 1406 + gfn_t gfn = *gfnp; 1407 1407 1408 1408 /* 1409 1409 * Make sure the adjustment is done only for THP pages. Also make ··· 1419 1419 if (sz < PMD_SIZE) 1420 1420 return PAGE_SIZE; 1421 1421 1422 - *ipap &= PMD_MASK; 1422 + gfn &= ~(PTRS_PER_PMD - 1); 1423 + *gfnp = gfn; 1423 1424 pfn &= ~(PTRS_PER_PMD - 1); 1424 1425 *pfnp = pfn; 1425 1426 ··· 1513 1512 } 1514 1513 } 1515 1514 1516 - static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache, 1517 - void **memcache) 1515 + static void *get_mmu_memcache(struct kvm_vcpu *vcpu) 1518 1516 { 1519 - int min_pages; 1520 - 1521 1517 if (!is_protected_kvm_enabled()) 1522 - *memcache = &vcpu->arch.mmu_page_cache; 1518 + return &vcpu->arch.mmu_page_cache; 1523 1519 else 1524 - *memcache = &vcpu->arch.pkvm_memcache; 1520 + return &vcpu->arch.pkvm_memcache; 1521 + } 1525 1522 1526 - if (!topup_memcache) 1527 - return 0; 1528 - 1529 - min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1523 + static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache) 1524 + { 1525 + int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); 1530 1526 1531 1527 if (!is_protected_kvm_enabled()) 1532 - return kvm_mmu_topup_memory_cache(*memcache, min_pages); 1528 + return kvm_mmu_topup_memory_cache(memcache, min_pages); 1533 1529 1534 - return topup_hyp_memcache(*memcache, min_pages); 1530 + return topup_hyp_memcache(memcache, min_pages); 1535 1531 } 1536 1532 1537 1533 /* ··· 1541 1543 * TLB invalidation from the guest and used to limit the invalidation scope if a 1542 1544 * TTL hint or a range isn't provided. 1543 1545 */ 1544 - static void adjust_nested_fault_perms(struct kvm_s2_trans *nested, 1545 - enum kvm_pgtable_prot *prot, 1546 - bool *writable) 1546 + static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested, 1547 + enum kvm_pgtable_prot prot) 1547 1548 { 1548 - *writable &= kvm_s2_trans_writable(nested); 1549 + if (!kvm_s2_trans_writable(nested)) 1550 + prot &= ~KVM_PGTABLE_PROT_W; 1549 1551 if (!kvm_s2_trans_readable(nested)) 1550 - *prot &= ~KVM_PGTABLE_PROT_R; 1552 + prot &= ~KVM_PGTABLE_PROT_R; 1551 1553 1552 - *prot |= kvm_encode_nested_level(nested); 1554 + return prot | kvm_encode_nested_level(nested); 1553 1555 } 1554 1556 1555 - static void adjust_nested_exec_perms(struct kvm *kvm, 1556 - struct kvm_s2_trans *nested, 1557 - enum kvm_pgtable_prot *prot) 1557 + static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm, 1558 + struct kvm_s2_trans *nested, 1559 + enum kvm_pgtable_prot prot) 1558 1560 { 1559 1561 if (!kvm_s2_trans_exec_el0(kvm, nested)) 1560 - *prot &= ~KVM_PGTABLE_PROT_UX; 1562 + prot &= ~KVM_PGTABLE_PROT_UX; 1561 1563 if (!kvm_s2_trans_exec_el1(kvm, nested)) 1562 - *prot &= ~KVM_PGTABLE_PROT_PX; 1564 + prot &= ~KVM_PGTABLE_PROT_PX; 1565 + 1566 + return prot; 1563 1567 } 1564 1568 1565 - static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1566 - struct kvm_s2_trans *nested, 1567 - struct kvm_memory_slot *memslot, bool is_perm) 1569 + struct kvm_s2_fault_desc { 1570 + struct kvm_vcpu *vcpu; 1571 + phys_addr_t fault_ipa; 1572 + struct kvm_s2_trans *nested; 1573 + struct kvm_memory_slot *memslot; 1574 + unsigned long hva; 1575 + }; 1576 + 1577 + static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) 1568 1578 { 1569 - bool write_fault, exec_fault, writable; 1579 + bool write_fault, exec_fault; 1570 1580 enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1571 1581 enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1572 - struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt; 1582 + struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt; 1573 1583 unsigned long mmu_seq; 1574 1584 struct page *page; 1575 - struct kvm *kvm = vcpu->kvm; 1585 + struct kvm *kvm = s2fd->vcpu->kvm; 1576 1586 void *memcache; 1577 1587 kvm_pfn_t pfn; 1578 1588 gfn_t gfn; 1579 1589 int ret; 1580 1590 1581 - ret = prepare_mmu_memcache(vcpu, true, &memcache); 1591 + memcache = get_mmu_memcache(s2fd->vcpu); 1592 + ret = topup_mmu_memcache(s2fd->vcpu, memcache); 1582 1593 if (ret) 1583 1594 return ret; 1584 1595 1585 - if (nested) 1586 - gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT; 1596 + if (s2fd->nested) 1597 + gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT; 1587 1598 else 1588 - gfn = fault_ipa >> PAGE_SHIFT; 1599 + gfn = s2fd->fault_ipa >> PAGE_SHIFT; 1589 1600 1590 - write_fault = kvm_is_write_fault(vcpu); 1591 - exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1601 + write_fault = kvm_is_write_fault(s2fd->vcpu); 1602 + exec_fault = kvm_vcpu_trap_is_exec_fault(s2fd->vcpu); 1592 1603 1593 1604 VM_WARN_ON_ONCE(write_fault && exec_fault); 1594 1605 ··· 1605 1598 /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */ 1606 1599 smp_rmb(); 1607 1600 1608 - ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL); 1601 + ret = kvm_gmem_get_pfn(kvm, s2fd->memslot, gfn, &pfn, &page, NULL); 1609 1602 if (ret) { 1610 - kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE, 1603 + kvm_prepare_memory_fault_exit(s2fd->vcpu, s2fd->fault_ipa, PAGE_SIZE, 1611 1604 write_fault, exec_fault, false); 1612 1605 return ret; 1613 1606 } 1614 1607 1615 - writable = !(memslot->flags & KVM_MEM_READONLY); 1616 - 1617 - if (nested) 1618 - adjust_nested_fault_perms(nested, &prot, &writable); 1619 - 1620 - if (writable) 1608 + if (!(s2fd->memslot->flags & KVM_MEM_READONLY)) 1621 1609 prot |= KVM_PGTABLE_PROT_W; 1610 + 1611 + if (s2fd->nested) 1612 + prot = adjust_nested_fault_perms(s2fd->nested, prot); 1622 1613 1623 1614 if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) 1624 1615 prot |= KVM_PGTABLE_PROT_X; 1625 1616 1626 - if (nested) 1627 - adjust_nested_exec_perms(kvm, nested, &prot); 1617 + if (s2fd->nested) 1618 + prot = adjust_nested_exec_perms(kvm, s2fd->nested, prot); 1628 1619 1629 1620 kvm_fault_lock(kvm); 1630 1621 if (mmu_invalidate_retry(kvm, mmu_seq)) { ··· 1630 1625 goto out_unlock; 1631 1626 } 1632 1627 1633 - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE, 1628 + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, 1634 1629 __pfn_to_phys(pfn), prot, 1635 1630 memcache, flags); 1636 1631 1637 1632 out_unlock: 1638 - kvm_release_faultin_page(kvm, page, !!ret, writable); 1633 + kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W); 1639 1634 kvm_fault_unlock(kvm); 1640 1635 1641 - if (writable && !ret) 1642 - mark_page_dirty_in_slot(kvm, memslot, gfn); 1636 + if ((prot & KVM_PGTABLE_PROT_W) && !ret) 1637 + mark_page_dirty_in_slot(kvm, s2fd->memslot, gfn); 1643 1638 1644 1639 return ret != -EAGAIN ? ret : 0; 1645 1640 } 1646 1641 1647 - static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, 1648 - struct kvm_s2_trans *nested, 1649 - struct kvm_memory_slot *memslot, unsigned long hva, 1650 - bool fault_is_perm) 1642 + struct kvm_s2_fault_vma_info { 1643 + unsigned long mmu_seq; 1644 + long vma_pagesize; 1645 + vm_flags_t vm_flags; 1646 + unsigned long max_map_size; 1647 + struct page *page; 1648 + kvm_pfn_t pfn; 1649 + gfn_t gfn; 1650 + bool device; 1651 + bool mte_allowed; 1652 + bool is_vma_cacheable; 1653 + bool map_writable; 1654 + bool map_non_cacheable; 1655 + }; 1656 + 1657 + static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd, 1658 + struct kvm_s2_fault_vma_info *s2vi, 1659 + struct vm_area_struct *vma) 1651 1660 { 1652 - int ret = 0; 1653 - bool topup_memcache; 1654 - bool write_fault, writable; 1655 - bool exec_fault, mte_allowed, is_vma_cacheable; 1656 - bool s2_force_noncacheable = false, vfio_allow_any_uc = false; 1657 - unsigned long mmu_seq; 1658 - phys_addr_t ipa = fault_ipa; 1659 - struct kvm *kvm = vcpu->kvm; 1660 - struct vm_area_struct *vma; 1661 1661 short vma_shift; 1662 - void *memcache; 1663 - gfn_t gfn; 1664 - kvm_pfn_t pfn; 1665 - bool logging_active = memslot_is_logging(memslot); 1666 - bool force_pte = logging_active; 1667 - long vma_pagesize, fault_granule; 1668 - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; 1669 - struct kvm_pgtable *pgt; 1670 - struct page *page; 1671 - vm_flags_t vm_flags; 1672 - enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1673 1662 1674 - if (fault_is_perm) 1675 - fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); 1676 - write_fault = kvm_is_write_fault(vcpu); 1677 - exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); 1678 - VM_WARN_ON_ONCE(write_fault && exec_fault); 1679 - 1680 - /* 1681 - * Permission faults just need to update the existing leaf entry, 1682 - * and so normally don't require allocations from the memcache. The 1683 - * only exception to this is when dirty logging is enabled at runtime 1684 - * and a write fault needs to collapse a block entry into a table. 1685 - */ 1686 - topup_memcache = !fault_is_perm || (logging_active && write_fault); 1687 - ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache); 1688 - if (ret) 1689 - return ret; 1690 - 1691 - /* 1692 - * Let's check if we will get back a huge page backed by hugetlbfs, or 1693 - * get block mapping for device MMIO region. 1694 - */ 1695 - mmap_read_lock(current->mm); 1696 - vma = vma_lookup(current->mm, hva); 1697 - if (unlikely(!vma)) { 1698 - kvm_err("Failed to find VMA for hva 0x%lx\n", hva); 1699 - mmap_read_unlock(current->mm); 1700 - return -EFAULT; 1701 - } 1702 - 1703 - if (force_pte) 1663 + if (memslot_is_logging(s2fd->memslot)) { 1664 + s2vi->max_map_size = PAGE_SIZE; 1704 1665 vma_shift = PAGE_SHIFT; 1705 - else 1706 - vma_shift = get_vma_page_shift(vma, hva); 1666 + } else { 1667 + s2vi->max_map_size = PUD_SIZE; 1668 + vma_shift = get_vma_page_shift(vma, s2fd->hva); 1669 + } 1707 1670 1708 1671 switch (vma_shift) { 1709 1672 #ifndef __PAGETABLE_PMD_FOLDED 1710 1673 case PUD_SHIFT: 1711 - if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) 1674 + if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PUD_SIZE)) 1712 1675 break; 1713 1676 fallthrough; 1714 1677 #endif ··· 1684 1711 vma_shift = PMD_SHIFT; 1685 1712 fallthrough; 1686 1713 case PMD_SHIFT: 1687 - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) 1714 + if (fault_supports_stage2_huge_mapping(s2fd->memslot, s2fd->hva, PMD_SIZE)) 1688 1715 break; 1689 1716 fallthrough; 1690 1717 case CONT_PTE_SHIFT: 1691 1718 vma_shift = PAGE_SHIFT; 1692 - force_pte = true; 1719 + s2vi->max_map_size = PAGE_SIZE; 1693 1720 fallthrough; 1694 1721 case PAGE_SHIFT: 1695 1722 break; ··· 1697 1724 WARN_ONCE(1, "Unknown vma_shift %d", vma_shift); 1698 1725 } 1699 1726 1700 - vma_pagesize = 1UL << vma_shift; 1701 - 1702 - if (nested) { 1727 + if (s2fd->nested) { 1703 1728 unsigned long max_map_size; 1704 1729 1705 - max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE; 1706 - 1707 - ipa = kvm_s2_trans_output(nested); 1730 + max_map_size = min(s2vi->max_map_size, PUD_SIZE); 1708 1731 1709 1732 /* 1710 1733 * If we're about to create a shadow stage 2 entry, then we 1711 1734 * can only create a block mapping if the guest stage 2 page 1712 1735 * table uses at least as big a mapping. 1713 1736 */ 1714 - max_map_size = min(kvm_s2_trans_size(nested), max_map_size); 1737 + max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size); 1715 1738 1716 1739 /* 1717 1740 * Be careful that if the mapping size falls between ··· 1718 1749 else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE) 1719 1750 max_map_size = PAGE_SIZE; 1720 1751 1721 - force_pte = (max_map_size == PAGE_SIZE); 1722 - vma_pagesize = min_t(long, vma_pagesize, max_map_size); 1723 - vma_shift = __ffs(vma_pagesize); 1752 + s2vi->max_map_size = max_map_size; 1753 + vma_shift = min_t(short, vma_shift, __ffs(max_map_size)); 1724 1754 } 1755 + 1756 + return vma_shift; 1757 + } 1758 + 1759 + static bool kvm_s2_fault_is_perm(const struct kvm_s2_fault_desc *s2fd) 1760 + { 1761 + return kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); 1762 + } 1763 + 1764 + static int kvm_s2_fault_get_vma_info(const struct kvm_s2_fault_desc *s2fd, 1765 + struct kvm_s2_fault_vma_info *s2vi) 1766 + { 1767 + struct vm_area_struct *vma; 1768 + struct kvm *kvm = s2fd->vcpu->kvm; 1769 + 1770 + mmap_read_lock(current->mm); 1771 + vma = vma_lookup(current->mm, s2fd->hva); 1772 + if (unlikely(!vma)) { 1773 + kvm_err("Failed to find VMA for hva 0x%lx\n", s2fd->hva); 1774 + mmap_read_unlock(current->mm); 1775 + return -EFAULT; 1776 + } 1777 + 1778 + s2vi->vma_pagesize = BIT(kvm_s2_resolve_vma_size(s2fd, s2vi, vma)); 1725 1779 1726 1780 /* 1727 1781 * Both the canonical IPA and fault IPA must be aligned to the 1728 1782 * mapping size to ensure we find the right PFN and lay down the 1729 1783 * mapping in the right place. 1730 1784 */ 1731 - fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize); 1732 - ipa = ALIGN_DOWN(ipa, vma_pagesize); 1785 + s2vi->gfn = ALIGN_DOWN(s2fd->fault_ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; 1733 1786 1734 - gfn = ipa >> PAGE_SHIFT; 1735 - mte_allowed = kvm_vma_mte_allowed(vma); 1787 + s2vi->mte_allowed = kvm_vma_mte_allowed(vma); 1736 1788 1737 - vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; 1789 + s2vi->vm_flags = vma->vm_flags; 1738 1790 1739 - vm_flags = vma->vm_flags; 1740 - 1741 - is_vma_cacheable = kvm_vma_is_cacheable(vma); 1742 - 1743 - /* Don't use the VMA after the unlock -- it may have vanished */ 1744 - vma = NULL; 1791 + s2vi->is_vma_cacheable = kvm_vma_is_cacheable(vma); 1745 1792 1746 1793 /* 1747 1794 * Read mmu_invalidate_seq so that KVM can detect if the results of ··· 1767 1782 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs 1768 1783 * with the smp_wmb() in kvm_mmu_invalidate_end(). 1769 1784 */ 1770 - mmu_seq = kvm->mmu_invalidate_seq; 1785 + s2vi->mmu_seq = kvm->mmu_invalidate_seq; 1771 1786 mmap_read_unlock(current->mm); 1772 1787 1773 - pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, 1774 - &writable, &page); 1775 - if (pfn == KVM_PFN_ERR_HWPOISON) { 1776 - kvm_send_hwpoison_signal(hva, vma_shift); 1777 - return 0; 1778 - } 1779 - if (is_error_noslot_pfn(pfn)) 1788 + return 0; 1789 + } 1790 + 1791 + static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd, 1792 + const struct kvm_s2_fault_vma_info *s2vi) 1793 + { 1794 + phys_addr_t ipa; 1795 + 1796 + if (!s2fd->nested) 1797 + return s2vi->gfn; 1798 + 1799 + ipa = kvm_s2_trans_output(s2fd->nested); 1800 + return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT; 1801 + } 1802 + 1803 + static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd, 1804 + struct kvm_s2_fault_vma_info *s2vi) 1805 + { 1806 + int ret; 1807 + 1808 + ret = kvm_s2_fault_get_vma_info(s2fd, s2vi); 1809 + if (ret) 1810 + return ret; 1811 + 1812 + s2vi->pfn = __kvm_faultin_pfn(s2fd->memslot, get_canonical_gfn(s2fd, s2vi), 1813 + kvm_is_write_fault(s2fd->vcpu) ? FOLL_WRITE : 0, 1814 + &s2vi->map_writable, &s2vi->page); 1815 + if (unlikely(is_error_noslot_pfn(s2vi->pfn))) { 1816 + if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) { 1817 + kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize)); 1818 + return 0; 1819 + } 1780 1820 return -EFAULT; 1821 + } 1781 1822 1782 1823 /* 1783 1824 * Check if this is non-struct page memory PFN, and cannot support 1784 1825 * CMOs. It could potentially be unsafe to access as cacheable. 1785 1826 */ 1786 - if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) { 1787 - if (is_vma_cacheable) { 1827 + if (s2vi->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(s2vi->pfn)) { 1828 + if (s2vi->is_vma_cacheable) { 1788 1829 /* 1789 1830 * Whilst the VMA owner expects cacheable mapping to this 1790 1831 * PFN, hardware also has to support the FWB and CACHE DIC ··· 1823 1812 * S2FWB and CACHE DIC are mandatory to avoid the need for 1824 1813 * cache maintenance. 1825 1814 */ 1826 - if (!kvm_supports_cacheable_pfnmap()) 1827 - ret = -EFAULT; 1815 + if (!kvm_supports_cacheable_pfnmap()) { 1816 + kvm_release_faultin_page(s2fd->vcpu->kvm, s2vi->page, true, false); 1817 + return -EFAULT; 1818 + } 1828 1819 } else { 1829 1820 /* 1830 1821 * If the page was identified as device early by looking at ··· 1838 1825 * In both cases, we don't let transparent_hugepage_adjust() 1839 1826 * change things at the last minute. 1840 1827 */ 1841 - s2_force_noncacheable = true; 1828 + s2vi->map_non_cacheable = true; 1842 1829 } 1843 - } else if (logging_active && !write_fault) { 1844 - /* 1845 - * Only actually map the page as writable if this was a write 1846 - * fault. 1847 - */ 1848 - writable = false; 1830 + 1831 + s2vi->device = true; 1849 1832 } 1850 1833 1851 - if (exec_fault && s2_force_noncacheable) 1852 - ret = -ENOEXEC; 1834 + return 1; 1835 + } 1853 1836 1854 - if (ret) 1855 - goto out_put_page; 1837 + static int kvm_s2_fault_compute_prot(const struct kvm_s2_fault_desc *s2fd, 1838 + const struct kvm_s2_fault_vma_info *s2vi, 1839 + enum kvm_pgtable_prot *prot) 1840 + { 1841 + struct kvm *kvm = s2fd->vcpu->kvm; 1842 + 1843 + if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu) && s2vi->map_non_cacheable) 1844 + return -ENOEXEC; 1856 1845 1857 1846 /* 1858 1847 * Guest performs atomic/exclusive operations on memory with unsupported ··· 1862 1847 * and trigger the exception here. Since the memslot is valid, inject 1863 1848 * the fault back to the guest. 1864 1849 */ 1865 - if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(vcpu))) { 1866 - kvm_inject_dabt_excl_atomic(vcpu, kvm_vcpu_get_hfar(vcpu)); 1867 - ret = 1; 1868 - goto out_put_page; 1850 + if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(s2fd->vcpu))) { 1851 + kvm_inject_dabt_excl_atomic(s2fd->vcpu, kvm_vcpu_get_hfar(s2fd->vcpu)); 1852 + return 1; 1869 1853 } 1870 1854 1871 - if (nested) 1872 - adjust_nested_fault_perms(nested, &prot, &writable); 1855 + *prot = KVM_PGTABLE_PROT_R; 1856 + 1857 + if (s2vi->map_writable && (s2vi->device || 1858 + !memslot_is_logging(s2fd->memslot) || 1859 + kvm_is_write_fault(s2fd->vcpu))) 1860 + *prot |= KVM_PGTABLE_PROT_W; 1861 + 1862 + if (s2fd->nested) 1863 + *prot = adjust_nested_fault_perms(s2fd->nested, *prot); 1864 + 1865 + if (kvm_vcpu_trap_is_exec_fault(s2fd->vcpu)) 1866 + *prot |= KVM_PGTABLE_PROT_X; 1867 + 1868 + if (s2vi->map_non_cacheable) 1869 + *prot |= (s2vi->vm_flags & VM_ALLOW_ANY_UNCACHED) ? 1870 + KVM_PGTABLE_PROT_NORMAL_NC : KVM_PGTABLE_PROT_DEVICE; 1871 + else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) 1872 + *prot |= KVM_PGTABLE_PROT_X; 1873 + 1874 + if (s2fd->nested) 1875 + *prot = adjust_nested_exec_perms(kvm, s2fd->nested, *prot); 1876 + 1877 + if (!kvm_s2_fault_is_perm(s2fd) && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) { 1878 + /* Check the VMM hasn't introduced a new disallowed VMA */ 1879 + if (!s2vi->mte_allowed) 1880 + return -EFAULT; 1881 + } 1882 + 1883 + return 0; 1884 + } 1885 + 1886 + static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd, 1887 + const struct kvm_s2_fault_vma_info *s2vi, 1888 + enum kvm_pgtable_prot prot, 1889 + void *memcache) 1890 + { 1891 + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; 1892 + bool writable = prot & KVM_PGTABLE_PROT_W; 1893 + struct kvm *kvm = s2fd->vcpu->kvm; 1894 + struct kvm_pgtable *pgt; 1895 + long perm_fault_granule; 1896 + long mapping_size; 1897 + kvm_pfn_t pfn; 1898 + gfn_t gfn; 1899 + int ret; 1873 1900 1874 1901 kvm_fault_lock(kvm); 1875 - pgt = vcpu->arch.hw_mmu->pgt; 1876 - if (mmu_invalidate_retry(kvm, mmu_seq)) { 1877 - ret = -EAGAIN; 1902 + pgt = s2fd->vcpu->arch.hw_mmu->pgt; 1903 + ret = -EAGAIN; 1904 + if (mmu_invalidate_retry(kvm, s2vi->mmu_seq)) 1878 1905 goto out_unlock; 1879 - } 1906 + 1907 + perm_fault_granule = (kvm_s2_fault_is_perm(s2fd) ? 1908 + kvm_vcpu_trap_get_perm_fault_granule(s2fd->vcpu) : 0); 1909 + mapping_size = s2vi->vma_pagesize; 1910 + pfn = s2vi->pfn; 1911 + gfn = s2vi->gfn; 1880 1912 1881 1913 /* 1882 1914 * If we are not forced to use page mapping, check if we are 1883 1915 * backed by a THP and thus use block mapping if possible. 1884 1916 */ 1885 - if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) { 1886 - if (fault_is_perm && fault_granule > PAGE_SIZE) 1887 - vma_pagesize = fault_granule; 1888 - else 1889 - vma_pagesize = transparent_hugepage_adjust(kvm, memslot, 1890 - hva, &pfn, 1891 - &fault_ipa); 1892 - 1893 - if (vma_pagesize < 0) { 1894 - ret = vma_pagesize; 1895 - goto out_unlock; 1896 - } 1897 - } 1898 - 1899 - if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) { 1900 - /* Check the VMM hasn't introduced a new disallowed VMA */ 1901 - if (mte_allowed) { 1902 - sanitise_mte_tags(kvm, pfn, vma_pagesize); 1917 + if (mapping_size == PAGE_SIZE && 1918 + !(s2vi->max_map_size == PAGE_SIZE || s2vi->map_non_cacheable)) { 1919 + if (perm_fault_granule > PAGE_SIZE) { 1920 + mapping_size = perm_fault_granule; 1903 1921 } else { 1904 - ret = -EFAULT; 1905 - goto out_unlock; 1922 + mapping_size = transparent_hugepage_adjust(kvm, s2fd->memslot, 1923 + s2fd->hva, &pfn, 1924 + &gfn); 1925 + if (mapping_size < 0) { 1926 + ret = mapping_size; 1927 + goto out_unlock; 1928 + } 1906 1929 } 1907 1930 } 1908 1931 1909 - if (writable) 1910 - prot |= KVM_PGTABLE_PROT_W; 1911 - 1912 - if (exec_fault) 1913 - prot |= KVM_PGTABLE_PROT_X; 1914 - 1915 - if (s2_force_noncacheable) { 1916 - if (vfio_allow_any_uc) 1917 - prot |= KVM_PGTABLE_PROT_NORMAL_NC; 1918 - else 1919 - prot |= KVM_PGTABLE_PROT_DEVICE; 1920 - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { 1921 - prot |= KVM_PGTABLE_PROT_X; 1922 - } 1923 - 1924 - if (nested) 1925 - adjust_nested_exec_perms(kvm, nested, &prot); 1932 + if (!perm_fault_granule && !s2vi->map_non_cacheable && kvm_has_mte(kvm)) 1933 + sanitise_mte_tags(kvm, pfn, mapping_size); 1926 1934 1927 1935 /* 1928 1936 * Under the premise of getting a FSC_PERM fault, we just need to relax 1929 - * permissions only if vma_pagesize equals fault_granule. Otherwise, 1937 + * permissions only if mapping_size equals perm_fault_granule. Otherwise, 1930 1938 * kvm_pgtable_stage2_map() should be called to change block size. 1931 1939 */ 1932 - if (fault_is_perm && vma_pagesize == fault_granule) { 1940 + if (mapping_size == perm_fault_granule) { 1933 1941 /* 1934 1942 * Drop the SW bits in favour of those stored in the 1935 1943 * PTE, which will be preserved. 1936 1944 */ 1937 1945 prot &= ~KVM_NV_GUEST_MAP_SZ; 1938 - ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); 1946 + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn), 1947 + prot, flags); 1939 1948 } else { 1940 - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, 1941 - __pfn_to_phys(pfn), prot, 1942 - memcache, flags); 1949 + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size, 1950 + __pfn_to_phys(pfn), prot, 1951 + memcache, flags); 1943 1952 } 1944 1953 1945 1954 out_unlock: 1946 - kvm_release_faultin_page(kvm, page, !!ret, writable); 1955 + kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable); 1947 1956 kvm_fault_unlock(kvm); 1948 1957 1949 - /* Mark the page dirty only if the fault is handled successfully */ 1950 - if (writable && !ret) 1951 - mark_page_dirty_in_slot(kvm, memslot, gfn); 1958 + /* 1959 + * Mark the page dirty only if the fault is handled successfully, 1960 + * making sure we adjust the canonical IPA if the mapping size has 1961 + * been updated (via a THP upgrade, for example). 1962 + */ 1963 + if (writable && !ret) { 1964 + phys_addr_t ipa = gfn_to_gpa(get_canonical_gfn(s2fd, s2vi)); 1965 + ipa &= ~(mapping_size - 1); 1966 + mark_page_dirty_in_slot(kvm, s2fd->memslot, gpa_to_gfn(ipa)); 1967 + } 1952 1968 1953 - return ret != -EAGAIN ? ret : 0; 1969 + if (ret != -EAGAIN) 1970 + return ret; 1971 + return 0; 1972 + } 1954 1973 1955 - out_put_page: 1956 - kvm_release_page_unused(page); 1957 - return ret; 1974 + static int user_mem_abort(const struct kvm_s2_fault_desc *s2fd) 1975 + { 1976 + bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); 1977 + struct kvm_s2_fault_vma_info s2vi = {}; 1978 + enum kvm_pgtable_prot prot; 1979 + void *memcache; 1980 + int ret; 1981 + 1982 + /* 1983 + * Permission faults just need to update the existing leaf entry, 1984 + * and so normally don't require allocations from the memcache. The 1985 + * only exception to this is when dirty logging is enabled at runtime 1986 + * and a write fault needs to collapse a block entry into a table. 1987 + */ 1988 + memcache = get_mmu_memcache(s2fd->vcpu); 1989 + if (!perm_fault || (memslot_is_logging(s2fd->memslot) && 1990 + kvm_is_write_fault(s2fd->vcpu))) { 1991 + ret = topup_mmu_memcache(s2fd->vcpu, memcache); 1992 + if (ret) 1993 + return ret; 1994 + } 1995 + 1996 + /* 1997 + * Let's check if we will get back a huge page backed by hugetlbfs, or 1998 + * get block mapping for device MMIO region. 1999 + */ 2000 + ret = kvm_s2_fault_pin_pfn(s2fd, &s2vi); 2001 + if (ret != 1) 2002 + return ret; 2003 + 2004 + ret = kvm_s2_fault_compute_prot(s2fd, &s2vi, &prot); 2005 + if (ret) { 2006 + kvm_release_page_unused(s2vi.page); 2007 + return ret; 2008 + } 2009 + 2010 + return kvm_s2_fault_map(s2fd, &s2vi, prot, memcache); 1958 2011 } 1959 2012 1960 2013 /* Resolve the access fault by making the page young again. */ ··· 2288 2205 VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) && 2289 2206 !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu)); 2290 2207 2208 + const struct kvm_s2_fault_desc s2fd = { 2209 + .vcpu = vcpu, 2210 + .fault_ipa = fault_ipa, 2211 + .nested = nested, 2212 + .memslot = memslot, 2213 + .hva = hva, 2214 + }; 2215 + 2291 2216 if (kvm_slot_has_gmem(memslot)) 2292 - ret = gmem_abort(vcpu, fault_ipa, nested, memslot, 2293 - esr_fsc_is_permission_fault(esr)); 2217 + ret = gmem_abort(&s2fd); 2294 2218 else 2295 - ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva, 2296 - esr_fsc_is_permission_fault(esr)); 2219 + ret = user_mem_abort(&s2fd); 2220 + 2297 2221 if (ret == 0) 2298 2222 ret = 1; 2299 2223 out: