Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 memory management updates from Thomas Gleixner:

- Make LAM enablement safe against kernel threads which temporarily use
a process mm. Switching back to the process would not update CR3 and
would therefore not enable LAM, causing faults in user space when
using tagged pointers. Cure it by synchronizing LAM enablement via
IPIs to all CPUs which use the related mm.

- Cure a harmless LAM inconsistency between CR3 and the per-CPU TLB state
(cpu_tlbstate.lam) during context switch. The inconsistency is both
confusing and prone to lead to real bugs.

- Fix alternate signal stack handling for threads which run with a
non-zero protection key. The non-zero key prevents the kernel from
accessing the alternate stack. Cure it by temporarily enabling all
protection keys for the alternate stack setup/restore operations.

- Provide an EFI config table identity mapping for the kexec kernel to
prevent kexec failures caused by the new kernel being unable to access
the config table array.

- Use GB pages only when a full GB is mapped in the identity map as
otherwise the CPU can speculate into reserved areas after the end of
memory which causes malfunction on UV systems.

- Remove the noisy and pointless SRAT table dump during boot

- Use is_ioremap_addr() for iounmap() address range checks instead of
high_memory. is_ioremap_addr() is more precise.

* tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/ioremap: Improve iounmap() address range checks
x86/mm: Remove duplicate check from build_cr3()
x86/mm: Remove unused NX related declarations
x86/mm: Remove unused CR3_HW_ASID_BITS
x86/mm: Don't print out SRAT table information
x86/mm/ident_map: Use gbpages only where full GB page should be mapped.
x86/kexec: Add EFI config table identity mapping for kexec kernel
selftests/mm: Add new testcases for pkeys
x86/pkeys: Restore altstack access in sigreturn()
x86/pkeys: Update PKRU to enable all pkeys before XSAVE
x86/pkeys: Add helper functions to update PKRU on the sigframe
x86/pkeys: Add PKRU as a parameter in signal handling functions
x86/mm: Cleanup prctl_enable_tagged_addr() nr_bits error checking
x86/mm: Fix LAM inconsistency during context switch
x86/mm: Use IPIs to synchronize LAM enablement

+664 -60
+1 -1
arch/x86/include/asm/fpu/signal.h
··· 29 29 30 30 unsigned long fpu__get_fpstate_size(void); 31 31 32 - extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); 32 + extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru); 33 33 extern void fpu__clear_user_states(struct fpu *fpu); 34 34 extern bool fpu__restore_sig(void __user *buf, int ia32_frame); 35 35
+7 -1
arch/x86/include/asm/mmu_context.h
··· 88 88 #ifdef CONFIG_ADDRESS_MASKING 89 89 static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) 90 90 { 91 - return mm->context.lam_cr3_mask; 91 + /* 92 + * When switch_mm_irqs_off() is called for a kthread, it may race with 93 + * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two 94 + * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it 95 + * reads a single value for both. 96 + */ 97 + return READ_ONCE(mm->context.lam_cr3_mask); 92 98 } 93 99 94 100 static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
-2
arch/x86/include/asm/pgtable_types.h
··· 517 517 518 518 extern pteval_t __supported_pte_mask; 519 519 extern pteval_t __default_kernel_pte_mask; 520 - extern void set_nx(void); 521 - extern int nx_enabled; 522 520 523 521 #define pgprot_writecombine pgprot_writecombine 524 522 extern pgprot_t pgprot_writecombine(pgprot_t prot);
+4 -5
arch/x86/include/asm/tlbflush.h
··· 399 399 return lam << X86_CR3_LAM_U57_BIT; 400 400 } 401 401 402 - static inline void set_tlbstate_lam_mode(struct mm_struct *mm) 402 + static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) 403 403 { 404 - this_cpu_write(cpu_tlbstate.lam, 405 - mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT); 406 - this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask); 404 + this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); 405 + this_cpu_write(tlbstate_untag_mask, untag_mask); 407 406 } 408 407 409 408 #else ··· 412 413 return 0; 413 414 } 414 415 415 - static inline void set_tlbstate_lam_mode(struct mm_struct *mm) 416 + static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) 416 417 { 417 418 } 418 419 #endif
+22 -5
arch/x86/kernel/fpu/signal.c
··· 64 64 } 65 65 66 66 /* 67 + * Update the value of PKRU register that was already pushed onto the signal frame. 68 + */ 69 + static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) 70 + { 71 + if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) 72 + return 0; 73 + return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); 74 + } 75 + 76 + /* 67 77 * Signal frame handlers. 68 78 */ 69 79 static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) ··· 166 156 return !err; 167 157 } 168 158 169 - static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) 159 + static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru) 170 160 { 171 - if (use_xsave()) 172 - return xsave_to_user_sigframe(buf); 161 + int err = 0; 162 + 163 + if (use_xsave()) { 164 + err = xsave_to_user_sigframe(buf); 165 + if (!err) 166 + err = update_pkru_in_sigframe(buf, pkru); 167 + return err; 168 + } 169 + 173 170 if (use_fxsr()) 174 171 return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); 175 172 else ··· 202 185 * For [f]xsave state, update the SW reserved fields in the [f]xsave frame 203 186 * indicating the absence/presence of the extended state to the user. 204 187 */ 205 - bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) 188 + bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) 206 189 { 207 190 struct task_struct *tsk = current; 208 191 struct fpstate *fpstate = tsk->thread.fpu.fpstate; ··· 245 228 fpregs_restore_userregs(); 246 229 247 230 pagefault_disable(); 248 - ret = copy_fpregs_to_sigframe(buf_fx); 231 + ret = copy_fpregs_to_sigframe(buf_fx, pkru); 249 232 pagefault_enable(); 250 233 fpregs_unlock(); 251 234
+13
arch/x86/kernel/fpu/xstate.c
··· 999 999 } 1000 1000 EXPORT_SYMBOL_GPL(get_xsave_addr); 1001 1001 1002 + /* 1003 + * Given an xstate feature nr, calculate where in the xsave buffer the state is. 1004 + * The xsave buffer should be in standard format, not compacted (e.g. user mode 1005 + * signal frames). 1006 + */ 1007 + void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) 1008 + { 1009 + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) 1010 + return NULL; 1011 + 1012 + return (void __user *)xsave + xstate_offsets[xfeature_nr]; 1013 + } 1014 + 1002 1015 #ifdef CONFIG_ARCH_HAS_PKEYS 1003 1016 1004 1017 /*
+2
arch/x86/kernel/fpu/xstate.h
··· 54 54 extern void fpu__init_cpu_xstate(void); 55 55 extern void fpu__init_system_xstate(unsigned int legacy_size); 56 56 57 + extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); 58 + 57 59 static inline u64 xfeatures_mask_supervisor(void) 58 60 { 59 61 return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+27
arch/x86/kernel/machine_kexec_64.c
··· 28 28 #include <asm/setup.h> 29 29 #include <asm/set_memory.h> 30 30 #include <asm/cpu.h> 31 + #include <asm/efi.h> 31 32 32 33 #ifdef CONFIG_ACPI 33 34 /* ··· 88 87 { 89 88 #ifdef CONFIG_EFI 90 89 unsigned long mstart, mend; 90 + void *kaddr; 91 + int ret; 91 92 92 93 if (!efi_enabled(EFI_BOOT)) 93 94 return 0; ··· 104 101 105 102 if (!mstart) 106 103 return 0; 104 + 105 + ret = kernel_ident_mapping_init(info, level4p, mstart, mend); 106 + if (ret) 107 + return ret; 108 + 109 + kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB); 110 + if (!kaddr) { 111 + pr_err("Could not map UEFI system table\n"); 112 + return -ENOMEM; 113 + } 114 + 115 + mstart = efi_config_table; 116 + 117 + if (efi_enabled(EFI_64BIT)) { 118 + efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr; 119 + 120 + mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables; 121 + } else { 122 + efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr; 123 + 124 + mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables; 125 + } 126 + 127 + memunmap(kaddr); 107 128 108 129 return kernel_ident_mapping_init(info, level4p, mstart, mend); 109 130 #endif
+32 -10
arch/x86/kernel/process_64.c
··· 798 798 799 799 #define LAM_U57_BITS 6 800 800 801 + static void enable_lam_func(void *__mm) 802 + { 803 + struct mm_struct *mm = __mm; 804 + unsigned long lam; 805 + 806 + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { 807 + lam = mm_lam_cr3_mask(mm); 808 + write_cr3(__read_cr3() | lam); 809 + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); 810 + } 811 + } 812 + 813 + static void mm_enable_lam(struct mm_struct *mm) 814 + { 815 + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; 816 + mm->context.untag_mask = ~GENMASK(62, 57); 817 + 818 + /* 819 + * Even though the process must still be single-threaded at this 820 + * point, kernel threads may be using the mm. IPI those kernel 821 + * threads if they exist. 822 + */ 823 + on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); 824 + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); 825 + } 826 + 801 827 static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) 802 828 { 803 829 if (!cpu_feature_enabled(X86_FEATURE_LAM)) ··· 840 814 if (mmap_write_lock_killable(mm)) 841 815 return -EINTR; 842 816 817 + /* 818 + * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from 819 + * being enabled unless the process is single threaded: 820 + */ 843 821 if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { 844 822 mmap_write_unlock(mm); 845 823 return -EBUSY; 846 824 } 847 825 848 - if (!nr_bits) { 849 - mmap_write_unlock(mm); 850 - return -EINVAL; 851 - } else if (nr_bits <= LAM_U57_BITS) { 852 - mm->context.lam_cr3_mask = X86_CR3_LAM_U57; 853 - mm->context.untag_mask = ~GENMASK(62, 57); 854 - } else { 826 + if (!nr_bits || nr_bits > LAM_U57_BITS) { 855 827 mmap_write_unlock(mm); 856 828 return -EINVAL; 857 829 } 858 830 859 - write_cr3(__read_cr3() | mm->context.lam_cr3_mask); 860 - set_tlbstate_lam_mode(mm); 861 - set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); 831 + mm_enable_lam(mm); 862 832 863 833 mmap_write_unlock(mm); 864 834
+28 -1
arch/x86/kernel/signal.c
··· 61 61 } 62 62 63 63 /* 64 + * Enable all pkeys temporarily, so as to ensure that both the current 65 + * execution stack as well as the alternate signal stack are writeable. 66 + * The application can use any of the available pkeys to protect the 67 + * alternate signal stack, and we don't know which one it is, so enable 68 + * all. The PKRU register will be reset to init_pkru later in the flow, 69 + * in fpu__clear_user_states(), and it is the application's responsibility 70 + * to enable the appropriate pkey as the first step in the signal handler 71 + * so that the handler does not segfault. 72 + */ 73 + static inline u32 sig_prepare_pkru(void) 74 + { 75 + u32 orig_pkru = read_pkru(); 76 + 77 + write_pkru(0); 78 + return orig_pkru; 79 + } 80 + 81 + /* 64 82 * Set up a signal frame. 65 83 */ 66 84 ··· 102 84 unsigned long math_size = 0; 103 85 unsigned long sp = regs->sp; 104 86 unsigned long buf_fx = 0; 87 + u32 pkru; 105 88 106 89 /* redzone */ 107 90 if (!ia32_frame) ··· 157 138 return (void __user *)-1L; 158 139 } 159 140 141 + /* Update PKRU to enable access to the alternate signal stack. */ 142 + pkru = sig_prepare_pkru(); 160 143 /* save i387 and extended state */ 161 - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) 144 + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { 145 + /* 146 + * Restore PKRU to the original, user-defined value; disable 147 + * extra pkeys enabled for the alternate signal stack, if any. 148 + */ 149 + write_pkru(pkru); 162 150 return (void __user *)-1L; 151 + } 163 152 164 153 return (void __user *)sp; 165 154 }
+3 -3
arch/x86/kernel/signal_64.c
··· 260 260 261 261 set_current_blocked(&set); 262 262 263 + if (restore_altstack(&frame->uc.uc_stack)) 264 + goto badframe; 265 + 263 266 if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) 264 267 goto badframe; 265 268 266 269 if (restore_signal_shadow_stack()) 267 - goto badframe; 268 - 269 - if (restore_altstack(&frame->uc.uc_stack)) 270 270 goto badframe; 271 271 272 272 return regs->ax;
+18 -5
arch/x86/mm/ident_map.c
··· 99 99 for (; addr < end; addr = next) { 100 100 pud_t *pud = pud_page + pud_index(addr); 101 101 pmd_t *pmd; 102 + bool use_gbpage; 102 103 103 104 next = (addr & PUD_MASK) + PUD_SIZE; 104 105 if (next > end) 105 106 next = end; 106 107 107 - if (info->direct_gbpages) { 108 + /* if this is already a gbpage, this portion is already mapped */ 109 + if (pud_leaf(*pud)) 110 + continue; 111 + 112 + /* Is using a gbpage allowed? */ 113 + use_gbpage = info->direct_gbpages; 114 + 115 + /* Don't use gbpage if it maps more than the requested region. */ 116 + /* at the begining: */ 117 + use_gbpage &= ((addr & ~PUD_MASK) == 0); 118 + /* ... or at the end: */ 119 + use_gbpage &= ((next & ~PUD_MASK) == 0); 120 + 121 + /* Never overwrite existing mappings */ 122 + use_gbpage &= !pud_present(*pud); 123 + 124 + if (use_gbpage) { 108 125 pud_t pudval; 109 126 110 - if (pud_present(*pud)) 111 - continue; 112 - 113 - addr &= PUD_MASK; 114 127 pudval = __pud((addr - info->offset) | info->page_flag); 115 128 set_pud(pud, pudval); 116 129 continue;
+2 -1
arch/x86/mm/ioremap.c
··· 11 11 #include <linux/init.h> 12 12 #include <linux/io.h> 13 13 #include <linux/ioport.h> 14 + #include <linux/ioremap.h> 14 15 #include <linux/slab.h> 15 16 #include <linux/vmalloc.h> 16 17 #include <linux/mmiotrace.h> ··· 458 457 { 459 458 struct vm_struct *p, *o; 460 459 461 - if ((void __force *)addr <= high_memory) 460 + if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr))) 462 461 return; 463 462 464 463 /*
+2 -4
arch/x86/mm/srat.c
··· 57 57 } 58 58 set_apicid_to_node(apic_id, node); 59 59 node_set(node, numa_nodes_parsed); 60 - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", 61 - pxm, apic_id, node); 60 + pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); 62 61 } 63 62 64 63 /* Callback for Proximity Domain -> LAPIC mapping */ ··· 97 98 98 99 set_apicid_to_node(apic_id, node); 99 100 node_set(node, numa_nodes_parsed); 100 - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", 101 - pxm, apic_id, node); 101 + pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); 102 102 } 103 103 104 104 int __init x86_acpi_numa_init(void)
+8 -11
arch/x86/mm/tlb.c
··· 11 11 #include <linux/sched/smt.h> 12 12 #include <linux/task_work.h> 13 13 #include <linux/mmu_notifier.h> 14 + #include <linux/mmu_context.h> 14 15 15 16 #include <asm/tlbflush.h> 16 17 #include <asm/mmu_context.h> ··· 86 85 * 87 86 */ 88 87 89 - /* There are 12 bits of space for ASIDS in CR3 */ 90 - #define CR3_HW_ASID_BITS 12 91 - 92 88 /* 93 89 * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for 94 90 * user/kernel switches ··· 158 160 unsigned long cr3 = __sme_pa(pgd) | lam; 159 161 160 162 if (static_cpu_has(X86_FEATURE_PCID)) { 161 - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); 162 163 cr3 |= kern_pcid(asid); 163 164 } else { 164 165 VM_WARN_ON_ONCE(asid != 0); ··· 500 503 { 501 504 struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); 502 505 u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 503 - unsigned long new_lam = mm_lam_cr3_mask(next); 504 506 bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); 505 507 unsigned cpu = smp_processor_id(); 508 + unsigned long new_lam; 506 509 u64 next_tlb_gen; 507 510 bool need_flush; 508 511 u16 new_asid; ··· 616 619 cpumask_clear_cpu(cpu, mm_cpumask(prev)); 617 620 } 618 621 619 - /* 620 - * Start remote flushes and then read tlb_gen. 
621 - */ 622 + /* Start receiving IPIs and then read tlb_gen (and LAM below) */ 622 623 if (next != &init_mm) 623 624 cpumask_set_cpu(cpu, mm_cpumask(next)); 624 625 next_tlb_gen = atomic64_read(&next->context.tlb_gen); ··· 628 633 barrier(); 629 634 } 630 635 631 - set_tlbstate_lam_mode(next); 636 + new_lam = mm_lam_cr3_mask(next); 632 637 if (need_flush) { 633 638 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 634 639 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); ··· 647 652 648 653 this_cpu_write(cpu_tlbstate.loaded_mm, next); 649 654 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); 655 + cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); 650 656 651 657 if (next != prev) { 652 658 cr4_update_pce_mm(next); ··· 694 698 int i; 695 699 struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); 696 700 u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); 701 + unsigned long lam = mm_lam_cr3_mask(mm); 697 702 unsigned long cr3 = __read_cr3(); 698 703 699 704 /* Assert that CR3 already references the right mm. */ ··· 702 705 703 706 /* LAM expected to be disabled */ 704 707 WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); 705 - WARN_ON(mm_lam_cr3_mask(mm)); 708 + WARN_ON(lam); 706 709 707 710 /* 708 711 * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization ··· 721 724 this_cpu_write(cpu_tlbstate.next_asid, 1); 722 725 this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); 723 726 this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); 724 - set_tlbstate_lam_mode(mm); 727 + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); 725 728 726 729 for (i = 1; i < TLB_NR_DYN_ASIDS; i++) 727 730 this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
+1
include/linux/ioremap.h
··· 4 4 5 5 #include <linux/kasan.h> 6 6 #include <asm/pgtable.h> 7 + #include <asm/vmalloc.h> 7 8 8 9 #if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) 9 10 /*
+1
tools/testing/selftests/mm/Makefile
··· 90 90 CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) 91 91 92 92 VMTARGETS := protection_keys 93 + VMTARGETS += pkey_sighandler_tests 93 94 BINARIES_32 := $(VMTARGETS:%=%_32) 94 95 BINARIES_64 := $(VMTARGETS:%=%_64) 95 96
+12 -1
tools/testing/selftests/mm/pkey-helpers.h
··· 79 79 } \ 80 80 } while (0) 81 81 82 - __attribute__((noinline)) int read_ptr(int *ptr); 82 + #define barrier() __asm__ __volatile__("": : :"memory") 83 + #ifndef noinline 84 + # define noinline __attribute__((noinline)) 85 + #endif 86 + 87 + noinline int read_ptr(int *ptr) 88 + { 89 + /* Keep GCC from optimizing this away somehow */ 90 + barrier(); 91 + return *ptr; 92 + } 93 + 83 94 void expected_pkey_fault(int pkey); 84 95 int sys_pkey_alloc(unsigned long flags, unsigned long init_val); 85 96 int sys_pkey_free(unsigned long pkey);
+481
tools/testing/selftests/mm/pkey_sighandler_tests.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) 4 + * 5 + * The testcases in this file exercise various flows related to signal handling, 6 + * using an alternate signal stack, with the default pkey (pkey 0) disabled. 7 + * 8 + * Compile with: 9 + * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm 10 + * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm 11 + */ 12 + #define _GNU_SOURCE 13 + #define __SANE_USERSPACE_TYPES__ 14 + #include <errno.h> 15 + #include <sys/syscall.h> 16 + #include <string.h> 17 + #include <stdio.h> 18 + #include <stdint.h> 19 + #include <stdbool.h> 20 + #include <signal.h> 21 + #include <assert.h> 22 + #include <stdlib.h> 23 + #include <sys/mman.h> 24 + #include <sys/types.h> 25 + #include <sys/stat.h> 26 + #include <unistd.h> 27 + #include <pthread.h> 28 + #include <limits.h> 29 + 30 + #include "pkey-helpers.h" 31 + 32 + #define STACK_SIZE PTHREAD_STACK_MIN 33 + 34 + void expected_pkey_fault(int pkey) {} 35 + 36 + pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; 37 + pthread_cond_t cond = PTHREAD_COND_INITIALIZER; 38 + siginfo_t siginfo = {0}; 39 + 40 + /* 41 + * We need to use inline assembly instead of glibc's syscall because glibc's 42 + * syscall will attempt to access the PLT in order to call a library function 43 + * which is protected by MPK 0 which we don't have access to. 
44 + */ 45 + static inline __always_inline 46 + long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) 47 + { 48 + unsigned long ret; 49 + #ifdef __x86_64__ 50 + register long r10 asm("r10") = a4; 51 + register long r8 asm("r8") = a5; 52 + register long r9 asm("r9") = a6; 53 + asm volatile ("syscall" 54 + : "=a"(ret) 55 + : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9) 56 + : "rcx", "r11", "memory"); 57 + #elif defined __i386__ 58 + asm volatile ("int $0x80" 59 + : "=a"(ret) 60 + : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) 61 + : "memory"); 62 + #else 63 + # error syscall_raw() not implemented 64 + #endif 65 + return ret; 66 + } 67 + 68 + static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) 69 + { 70 + pthread_mutex_lock(&mutex); 71 + 72 + memcpy(&siginfo, info, sizeof(siginfo_t)); 73 + 74 + pthread_cond_signal(&cond); 75 + pthread_mutex_unlock(&mutex); 76 + 77 + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); 78 + } 79 + 80 + static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext) 81 + { 82 + pthread_mutex_lock(&mutex); 83 + 84 + memcpy(&siginfo, info, sizeof(siginfo_t)); 85 + 86 + pthread_cond_signal(&cond); 87 + pthread_mutex_unlock(&mutex); 88 + } 89 + 90 + static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext) 91 + { 92 + /* 93 + * pkru should be the init_pkru value which enabled MPK 0 so 94 + * we can use library functions. 95 + */ 96 + printf("%s invoked.\n", __func__); 97 + } 98 + 99 + static void raise_sigusr2(void) 100 + { 101 + pid_t tid = 0; 102 + 103 + tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0); 104 + 105 + syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0); 106 + 107 + /* 108 + * We should return from the signal handler here and be able to 109 + * return to the interrupted thread. 
110 + */ 111 + } 112 + 113 + static void *thread_segv_with_pkey0_disabled(void *ptr) 114 + { 115 + /* Disable MPK 0 (and all others too) */ 116 + __write_pkey_reg(0x55555555); 117 + 118 + /* Segfault (with SEGV_MAPERR) */ 119 + *(int *) (0x1) = 1; 120 + return NULL; 121 + } 122 + 123 + static void *thread_segv_pkuerr_stack(void *ptr) 124 + { 125 + /* Disable MPK 0 (and all others too) */ 126 + __write_pkey_reg(0x55555555); 127 + 128 + /* After we disable MPK 0, we can't access the stack to return */ 129 + return NULL; 130 + } 131 + 132 + static void *thread_segv_maperr_ptr(void *ptr) 133 + { 134 + stack_t *stack = ptr; 135 + int *bad = (int *)1; 136 + 137 + /* 138 + * Setup alternate signal stack, which should be pkey_mprotect()ed by 139 + * MPK 0. The thread's stack cannot be used for signals because it is 140 + * not accessible by the default init_pkru value of 0x55555554. 141 + */ 142 + syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); 143 + 144 + /* Disable MPK 0. Only MPK 1 is enabled. */ 145 + __write_pkey_reg(0x55555551); 146 + 147 + /* Segfault */ 148 + *bad = 1; 149 + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); 150 + return NULL; 151 + } 152 + 153 + /* 154 + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. 155 + * Note that the new thread stack and the alternate signal stack is 156 + * protected by MPK 0. 
157 + */ 158 + static void test_sigsegv_handler_with_pkey0_disabled(void) 159 + { 160 + struct sigaction sa; 161 + pthread_attr_t attr; 162 + pthread_t thr; 163 + 164 + sa.sa_flags = SA_SIGINFO; 165 + 166 + sa.sa_sigaction = sigsegv_handler; 167 + sigemptyset(&sa.sa_mask); 168 + if (sigaction(SIGSEGV, &sa, NULL) == -1) { 169 + perror("sigaction"); 170 + exit(EXIT_FAILURE); 171 + } 172 + 173 + memset(&siginfo, 0, sizeof(siginfo)); 174 + 175 + pthread_attr_init(&attr); 176 + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); 177 + 178 + pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL); 179 + 180 + pthread_mutex_lock(&mutex); 181 + while (siginfo.si_signo == 0) 182 + pthread_cond_wait(&cond, &mutex); 183 + pthread_mutex_unlock(&mutex); 184 + 185 + ksft_test_result(siginfo.si_signo == SIGSEGV && 186 + siginfo.si_code == SEGV_MAPERR && 187 + siginfo.si_addr == (void *)1, 188 + "%s\n", __func__); 189 + } 190 + 191 + /* 192 + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. 193 + * Note that the new thread stack and the alternate signal stack is 194 + * protected by MPK 0, which renders them inaccessible when MPK 0 195 + * is disabled. So just the return from the thread should cause a 196 + * segfault with SEGV_PKUERR. 
197 + */ 198 + static void test_sigsegv_handler_cannot_access_stack(void) 199 + { 200 + struct sigaction sa; 201 + pthread_attr_t attr; 202 + pthread_t thr; 203 + 204 + sa.sa_flags = SA_SIGINFO; 205 + 206 + sa.sa_sigaction = sigsegv_handler; 207 + sigemptyset(&sa.sa_mask); 208 + if (sigaction(SIGSEGV, &sa, NULL) == -1) { 209 + perror("sigaction"); 210 + exit(EXIT_FAILURE); 211 + } 212 + 213 + memset(&siginfo, 0, sizeof(siginfo)); 214 + 215 + pthread_attr_init(&attr); 216 + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); 217 + 218 + pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL); 219 + 220 + pthread_mutex_lock(&mutex); 221 + while (siginfo.si_signo == 0) 222 + pthread_cond_wait(&cond, &mutex); 223 + pthread_mutex_unlock(&mutex); 224 + 225 + ksft_test_result(siginfo.si_signo == SIGSEGV && 226 + siginfo.si_code == SEGV_PKUERR, 227 + "%s\n", __func__); 228 + } 229 + 230 + /* 231 + * Verify that the sigsegv handler that uses an alternate signal stack 232 + * is correctly invoked for a thread which uses a non-zero MPK to protect 233 + * its own stack, and disables all other MPKs (including 0). 
234 + */ 235 + static void test_sigsegv_handler_with_different_pkey_for_stack(void) 236 + { 237 + struct sigaction sa; 238 + static stack_t sigstack; 239 + void *stack; 240 + int pkey; 241 + int parent_pid = 0; 242 + int child_pid = 0; 243 + 244 + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; 245 + 246 + sa.sa_sigaction = sigsegv_handler; 247 + 248 + sigemptyset(&sa.sa_mask); 249 + if (sigaction(SIGSEGV, &sa, NULL) == -1) { 250 + perror("sigaction"); 251 + exit(EXIT_FAILURE); 252 + } 253 + 254 + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, 255 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 256 + 257 + assert(stack != MAP_FAILED); 258 + 259 + /* Allow access to MPK 0 and MPK 1 */ 260 + __write_pkey_reg(0x55555550); 261 + 262 + /* Protect the new stack with MPK 1 */ 263 + pkey = pkey_alloc(0, 0); 264 + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); 265 + 266 + /* Set up alternate signal stack that will use the default MPK */ 267 + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, 268 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 269 + sigstack.ss_flags = 0; 270 + sigstack.ss_size = STACK_SIZE; 271 + 272 + memset(&siginfo, 0, sizeof(siginfo)); 273 + 274 + /* Use clone to avoid newer glibcs using rseq on new threads */ 275 + long ret = syscall_raw(SYS_clone, 276 + CLONE_VM | CLONE_FS | CLONE_FILES | 277 + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | 278 + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | 279 + CLONE_DETACHED, 280 + (long) ((char *)(stack) + STACK_SIZE), 281 + (long) &parent_pid, 282 + (long) &child_pid, 0, 0); 283 + 284 + if (ret < 0) { 285 + errno = -ret; 286 + perror("clone"); 287 + } else if (ret == 0) { 288 + thread_segv_maperr_ptr(&sigstack); 289 + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); 290 + } 291 + 292 + pthread_mutex_lock(&mutex); 293 + while (siginfo.si_signo == 0) 294 + pthread_cond_wait(&cond, &mutex); 295 + pthread_mutex_unlock(&mutex); 296 + 297 + ksft_test_result(siginfo.si_signo == SIGSEGV && 298 + siginfo.si_code 
== SEGV_MAPERR && 299 + siginfo.si_addr == (void *)1, 300 + "%s\n", __func__); 301 + } 302 + 303 + /* 304 + * Verify that the PKRU value set by the application is correctly 305 + * restored upon return from signal handling. 306 + */ 307 + static void test_pkru_preserved_after_sigusr1(void) 308 + { 309 + struct sigaction sa; 310 + unsigned long pkru = 0x45454544; 311 + 312 + sa.sa_flags = SA_SIGINFO; 313 + 314 + sa.sa_sigaction = sigusr1_handler; 315 + sigemptyset(&sa.sa_mask); 316 + if (sigaction(SIGUSR1, &sa, NULL) == -1) { 317 + perror("sigaction"); 318 + exit(EXIT_FAILURE); 319 + } 320 + 321 + memset(&siginfo, 0, sizeof(siginfo)); 322 + 323 + __write_pkey_reg(pkru); 324 + 325 + raise(SIGUSR1); 326 + 327 + pthread_mutex_lock(&mutex); 328 + while (siginfo.si_signo == 0) 329 + pthread_cond_wait(&cond, &mutex); 330 + pthread_mutex_unlock(&mutex); 331 + 332 + /* Ensure the pkru value is the same after returning from signal. */ 333 + ksft_test_result(pkru == __read_pkey_reg() && 334 + siginfo.si_signo == SIGUSR1, 335 + "%s\n", __func__); 336 + } 337 + 338 + static noinline void *thread_sigusr2_self(void *ptr) 339 + { 340 + /* 341 + * A const char array like "Resuming after SIGUSR2" won't be stored on 342 + * the stack and the code could access it via an offset from the program 343 + * counter. This makes sure it's on the function's stack frame. 344 + */ 345 + char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ', 346 + 'a', 'f', 't', 'e', 'r', ' ', 347 + 'S', 'I', 'G', 'U', 'S', 'R', '2', 348 + '.', '.', '.', '\n', '\0'}; 349 + stack_t *stack = ptr; 350 + 351 + /* 352 + * Setup alternate signal stack, which should be pkey_mprotect()ed by 353 + * MPK 0. The thread's stack cannot be used for signals because it is 354 + * not accessible by the default init_pkru value of 0x55555554. 355 + */ 356 + syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); 357 + 358 + /* Disable MPK 0. Only MPK 2 is enabled. 
*/ 359 + __write_pkey_reg(0x55555545); 360 + 361 + raise_sigusr2(); 362 + 363 + /* Do something, to show the thread resumed execution after the signal */ 364 + syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0); 365 + 366 + /* 367 + * We can't return to test_pkru_sigreturn because it 368 + * will attempt to use a %rbp value which is on the stack 369 + * of the main thread. 370 + */ 371 + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); 372 + return NULL; 373 + } 374 + 375 + /* 376 + * Verify that sigreturn is able to restore altstack even if the thread had 377 + * disabled pkey 0. 378 + */ 379 + static void test_pkru_sigreturn(void) 380 + { 381 + struct sigaction sa = {0}; 382 + static stack_t sigstack; 383 + void *stack; 384 + int pkey; 385 + int parent_pid = 0; 386 + int child_pid = 0; 387 + 388 + sa.sa_handler = SIG_DFL; 389 + sa.sa_flags = 0; 390 + sigemptyset(&sa.sa_mask); 391 + 392 + /* 393 + * For this testcase, we do not want to handle SIGSEGV. Reset handler 394 + * to default so that the application can crash if it receives SIGSEGV. 395 + */ 396 + if (sigaction(SIGSEGV, &sa, NULL) == -1) { 397 + perror("sigaction"); 398 + exit(EXIT_FAILURE); 399 + } 400 + 401 + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; 402 + sa.sa_sigaction = sigusr2_handler; 403 + sigemptyset(&sa.sa_mask); 404 + 405 + if (sigaction(SIGUSR2, &sa, NULL) == -1) { 406 + perror("sigaction"); 407 + exit(EXIT_FAILURE); 408 + } 409 + 410 + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, 411 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 412 + 413 + assert(stack != MAP_FAILED); 414 + 415 + /* 416 + * Allow access to MPK 0 and MPK 2. The child thread (to be created 417 + * later in this flow) will have its stack protected by MPK 2, whereas 418 + * the current thread's stack is protected by the default MPK 0. Hence 419 + * both need to be enabled. 
420 + */ 421 + __write_pkey_reg(0x55555544); 422 + 423 + /* Protect the stack with MPK 2 */ 424 + pkey = pkey_alloc(0, 0); 425 + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); 426 + 427 + /* Set up alternate signal stack that will use the default MPK */ 428 + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, 429 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 430 + sigstack.ss_flags = 0; 431 + sigstack.ss_size = STACK_SIZE; 432 + 433 + /* Use clone to avoid newer glibcs using rseq on new threads */ 434 + long ret = syscall_raw(SYS_clone, 435 + CLONE_VM | CLONE_FS | CLONE_FILES | 436 + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | 437 + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | 438 + CLONE_DETACHED, 439 + (long) ((char *)(stack) + STACK_SIZE), 440 + (long) &parent_pid, 441 + (long) &child_pid, 0, 0); 442 + 443 + if (ret < 0) { 444 + errno = -ret; 445 + perror("clone"); 446 + } else if (ret == 0) { 447 + thread_sigusr2_self(&sigstack); 448 + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); 449 + } 450 + 451 + child_pid = ret; 452 + /* Check that thread exited */ 453 + do { 454 + sched_yield(); 455 + ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0); 456 + } while (ret != -ESRCH && ret != -EINVAL); 457 + 458 + ksft_test_result_pass("%s\n", __func__); 459 + } 460 + 461 + static void (*pkey_tests[])(void) = { 462 + test_sigsegv_handler_with_pkey0_disabled, 463 + test_sigsegv_handler_cannot_access_stack, 464 + test_sigsegv_handler_with_different_pkey_for_stack, 465 + test_pkru_preserved_after_sigusr1, 466 + test_pkru_sigreturn 467 + }; 468 + 469 + int main(int argc, char *argv[]) 470 + { 471 + int i; 472 + 473 + ksft_print_header(); 474 + ksft_set_plan(ARRAY_SIZE(pkey_tests)); 475 + 476 + for (i = 0; i < ARRAY_SIZE(pkey_tests); i++) 477 + (*pkey_tests[i])(); 478 + 479 + ksft_finished(); 480 + return 0; 481 + }
-10
tools/testing/selftests/mm/protection_keys.c
··· 954 954 nr_test_fds = 0; 955 955 } 956 956 957 - #define barrier() __asm__ __volatile__("": : :"memory") 958 - __attribute__((noinline)) int read_ptr(int *ptr) 959 - { 960 - /* 961 - * Keep GCC from optimizing this away somehow 962 - */ 963 - barrier(); 964 - return *ptr; 965 - } 966 - 967 957 void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) 968 958 { 969 959 int i, err;