Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'pm-4.7-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm

Pull power management fixes from Rafael Wysocki:
"One fix for a recent cpuidle core change that, against all odds,
introduced a functional regression on Power systems and the fix for
the crash during resume from hibernation on x86-64 that has been in
the works for the last few weeks (it actually was ready last week, but
I wanted to allow the reporters to test it for some more time).

Specifics:

- Fix a recent performance regression on Power systems (powernv and
pseries) introduced by a core cpuidle commit that decreased the
precision of the last_residency conversion from nano- to
microseconds, which should not matter in theory, but turned out to
play not-so-well with the special "snooze" idle state on Power
(Shreyas B Prabhu).

- Fix a crash during resume from hibernation on x86-64 caused by
possible corruption of the kernel text part of page tables in the
last phase of image restoration, which was exposed by a
security-related change during the 4.3 development cycle (Rafael Wysocki)"

* tag 'pm-4.7-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
cpuidle: Fix last_residency division
x86/power/64: Fix kernel text mapping corruption during image restoration

+113 -51
+85 -12
arch/x86/power/hibernate_64.c
··· 19 19 #include <asm/mtrr.h> 20 20 #include <asm/sections.h> 21 21 #include <asm/suspend.h> 22 + #include <asm/tlbflush.h> 22 23 23 24 /* Defined in hibernate_asm_64.S */ 24 25 extern asmlinkage __visible int restore_image(void); ··· 29 28 * kernel's text (this value is passed in the image header). 30 29 */ 31 30 unsigned long restore_jump_address __visible; 31 + unsigned long jump_address_phys; 32 32 33 33 /* 34 34 * Value of the cr3 register from before the hibernation (this value is passed ··· 39 37 40 38 pgd_t *temp_level4_pgt __visible; 41 39 42 - void *relocated_restore_code __visible; 40 + unsigned long relocated_restore_code __visible; 41 + 42 + static int set_up_temporary_text_mapping(void) 43 + { 44 + pmd_t *pmd; 45 + pud_t *pud; 46 + 47 + /* 48 + * The new mapping only has to cover the page containing the image 49 + * kernel's entry point (jump_address_phys), because the switch over to 50 + * it is carried out by relocated code running from a page allocated 51 + * specifically for this purpose and covered by the identity mapping, so 52 + * the temporary kernel text mapping is only needed for the final jump. 53 + * Moreover, in that mapping the virtual address of the image kernel's 54 + * entry point must be the same as its virtual address in the image 55 + * kernel (restore_jump_address), so the image kernel's 56 + * restore_registers() code doesn't find itself in a different area of 57 + * the virtual address space after switching over to the original page 58 + * tables used by the image kernel. 
59 + */ 60 + pud = (pud_t *)get_safe_page(GFP_ATOMIC); 61 + if (!pud) 62 + return -ENOMEM; 63 + 64 + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); 65 + if (!pmd) 66 + return -ENOMEM; 67 + 68 + set_pmd(pmd + pmd_index(restore_jump_address), 69 + __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); 70 + set_pud(pud + pud_index(restore_jump_address), 71 + __pud(__pa(pmd) | _KERNPG_TABLE)); 72 + set_pgd(temp_level4_pgt + pgd_index(restore_jump_address), 73 + __pgd(__pa(pud) | _KERNPG_TABLE)); 74 + 75 + return 0; 76 + } 43 77 44 78 static void *alloc_pgt_page(void *context) 45 79 { ··· 97 59 if (!temp_level4_pgt) 98 60 return -ENOMEM; 99 61 100 - /* It is safe to reuse the original kernel mapping */ 101 - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), 102 - init_level4_pgt[pgd_index(__START_KERNEL_map)]); 62 + /* Prepare a temporary mapping for the kernel text */ 63 + result = set_up_temporary_text_mapping(); 64 + if (result) 65 + return result; 103 66 104 67 /* Set up the direct mapping from scratch */ 105 68 for (i = 0; i < nr_pfn_mapped; i++) { ··· 117 78 return 0; 118 79 } 119 80 81 + static int relocate_restore_code(void) 82 + { 83 + pgd_t *pgd; 84 + pud_t *pud; 85 + 86 + relocated_restore_code = get_safe_page(GFP_ATOMIC); 87 + if (!relocated_restore_code) 88 + return -ENOMEM; 89 + 90 + memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); 91 + 92 + /* Make the page containing the relocated code executable */ 93 + pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); 94 + pud = pud_offset(pgd, relocated_restore_code); 95 + if (pud_large(*pud)) { 96 + set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); 97 + } else { 98 + pmd_t *pmd = pmd_offset(pud, relocated_restore_code); 99 + 100 + if (pmd_large(*pmd)) { 101 + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); 102 + } else { 103 + pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); 104 + 105 + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); 106 + } 107 + } 108 
+ __flush_tlb_all(); 109 + 110 + return 0; 111 + } 112 + 120 113 int swsusp_arch_resume(void) 121 114 { 122 115 int error; 123 116 124 117 /* We have got enough memory and from now on we cannot recover */ 125 - if ((error = set_up_temporary_mappings())) 118 + error = set_up_temporary_mappings(); 119 + if (error) 126 120 return error; 127 121 128 - relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); 129 - if (!relocated_restore_code) 130 - return -ENOMEM; 131 - memcpy(relocated_restore_code, &core_restore_code, 132 - &restore_registers - &core_restore_code); 122 + error = relocate_restore_code(); 123 + if (error) 124 + return error; 133 125 134 126 restore_image(); 135 127 return 0; ··· 179 109 180 110 struct restore_data_record { 181 111 unsigned long jump_address; 112 + unsigned long jump_address_phys; 182 113 unsigned long cr3; 183 114 unsigned long magic; 184 115 }; 185 116 186 - #define RESTORE_MAGIC 0x0123456789ABCDEFUL 117 + #define RESTORE_MAGIC 0x123456789ABCDEF0UL 187 118 188 119 /** 189 120 * arch_hibernation_header_save - populate the architecture specific part ··· 197 126 198 127 if (max_size < sizeof(struct restore_data_record)) 199 128 return -EOVERFLOW; 200 - rdr->jump_address = restore_jump_address; 129 + rdr->jump_address = (unsigned long)&restore_registers; 130 + rdr->jump_address_phys = __pa_symbol(&restore_registers); 201 131 rdr->cr3 = restore_cr3; 202 132 rdr->magic = RESTORE_MAGIC; 203 133 return 0; ··· 214 142 struct restore_data_record *rdr = addr; 215 143 216 144 restore_jump_address = rdr->jump_address; 145 + jump_address_phys = rdr->jump_address_phys; 217 146 restore_cr3 = rdr->cr3; 218 147 return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; 219 148 }
+24 -31
arch/x86/power/hibernate_asm_64.S
··· 44 44 pushfq 45 45 popq pt_regs_flags(%rax) 46 46 47 - /* save the address of restore_registers */ 48 - movq $restore_registers, %rax 49 - movq %rax, restore_jump_address(%rip) 50 47 /* save cr3 */ 51 48 movq %cr3, %rax 52 49 movq %rax, restore_cr3(%rip) ··· 54 57 ENDPROC(swsusp_arch_suspend) 55 58 56 59 ENTRY(restore_image) 57 - /* switch to temporary page tables */ 58 - movq $__PAGE_OFFSET, %rdx 59 - movq temp_level4_pgt(%rip), %rax 60 - subq %rdx, %rax 61 - movq %rax, %cr3 62 - /* Flush TLB */ 63 - movq mmu_cr4_features(%rip), %rax 64 - movq %rax, %rdx 65 - andq $~(X86_CR4_PGE), %rdx 66 - movq %rdx, %cr4; # turn off PGE 67 - movq %cr3, %rcx; # flush TLB 68 - movq %rcx, %cr3; 69 - movq %rax, %cr4; # turn PGE back on 70 - 71 60 /* prepare to jump to the image kernel */ 72 - movq restore_jump_address(%rip), %rax 73 - movq restore_cr3(%rip), %rbx 61 + movq restore_jump_address(%rip), %r8 62 + movq restore_cr3(%rip), %r9 63 + 64 + /* prepare to switch to temporary page tables */ 65 + movq temp_level4_pgt(%rip), %rax 66 + movq mmu_cr4_features(%rip), %rbx 74 67 75 68 /* prepare to copy image data to their original locations */ 76 69 movq restore_pblist(%rip), %rdx 70 + 71 + /* jump to relocated restore code */ 77 72 movq relocated_restore_code(%rip), %rcx 78 73 jmpq *%rcx 79 74 80 75 /* code below has been relocated to a safe page */ 81 76 ENTRY(core_restore_code) 77 + /* switch to temporary page tables */ 78 + movq $__PAGE_OFFSET, %rcx 79 + subq %rcx, %rax 80 + movq %rax, %cr3 81 + /* flush TLB */ 82 + movq %rbx, %rcx 83 + andq $~(X86_CR4_PGE), %rcx 84 + movq %rcx, %cr4; # turn off PGE 85 + movq %cr3, %rcx; # flush TLB 86 + movq %rcx, %cr3; 87 + movq %rbx, %cr4; # turn PGE back on 82 88 .Lloop: 83 89 testq %rdx, %rdx 84 90 jz .Ldone ··· 96 96 /* progress to the next pbe */ 97 97 movq pbe_next(%rdx), %rdx 98 98 jmp .Lloop 99 + 99 100 .Ldone: 100 101 /* jump to the restore_registers address from the image header */ 101 - jmpq *%rax 102 - /* 103 - * NOTE: This 
assumes that the boot kernel's text mapping covers the 104 - * image kernel's page containing restore_registers and the address of 105 - * this page is the same as in the image kernel's text mapping (it 106 - * should always be true, because the text mapping is linear, starting 107 - * from 0, and is supposed to cover the entire kernel text for every 108 - * kernel). 109 - * 110 - * code below belongs to the image kernel 111 - */ 102 + jmpq *%r8 112 103 104 + /* code below belongs to the image kernel */ 105 + .align PAGE_SIZE 113 106 ENTRY(restore_registers) 114 107 FRAME_BEGIN 115 108 /* go back to the original page tables */ 116 - movq %rbx, %cr3 109 + movq %r9, %cr3 117 110 118 111 /* Flush TLB, including "global" things (vmalloc) */ 119 112 movq mmu_cr4_features(%rip), %rax
+4 -8
drivers/cpuidle/cpuidle.c
··· 173 173 174 174 struct cpuidle_state *target_state = &drv->states[index]; 175 175 bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP); 176 - u64 time_start, time_end; 176 + ktime_t time_start, time_end; 177 177 s64 diff; 178 178 179 179 /* ··· 195 195 sched_idle_set_state(target_state); 196 196 197 197 trace_cpu_idle_rcuidle(index, dev->cpu); 198 - time_start = local_clock(); 198 + time_start = ns_to_ktime(local_clock()); 199 199 200 200 stop_critical_timings(); 201 201 entered_state = target_state->enter(dev, drv, index); 202 202 start_critical_timings(); 203 203 204 - time_end = local_clock(); 204 + time_end = ns_to_ktime(local_clock()); 205 205 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); 206 206 207 207 /* The cpu is no longer idle or about to enter idle. */ ··· 217 217 if (!cpuidle_state_is_coupled(drv, index)) 218 218 local_irq_enable(); 219 219 220 - /* 221 - * local_clock() returns the time in nanosecond, let's shift 222 - * by 10 (divide by 1024) to have microsecond based time. 223 - */ 224 - diff = (time_end - time_start) >> 10; 220 + diff = ktime_us_delta(time_end, time_start); 225 221 if (diff > INT_MAX) 226 222 diff = INT_MAX; 227 223