Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
"Two families of fixes:

- Fix an FPU context related boot crash on newer x86 hardware with
larger context sizes than what most people test. To fix this
without ugly kludges or extensive reverts we had to touch core task
allocator, to allow x86 to determine the task size dynamically, at
boot time.

I've tested it on a number of x86 platforms, and I cross-built it
to a handful of architectures:

(warns) (warns)
testing x86-64: -git: pass ( 0), -tip: pass ( 0)
testing x86-32: -git: pass ( 0), -tip: pass ( 0)
testing arm: -git: pass ( 1359), -tip: pass ( 1359)
testing cris: -git: pass ( 1031), -tip: pass ( 1031)
testing m32r: -git: pass ( 1135), -tip: pass ( 1135)
testing m68k: -git: pass ( 1471), -tip: pass ( 1471)
testing mips: -git: pass ( 1162), -tip: pass ( 1162)
testing mn10300: -git: pass ( 1058), -tip: pass ( 1058)
testing parisc: -git: pass ( 1846), -tip: pass ( 1846)
testing sparc: -git: pass ( 1185), -tip: pass ( 1185)

... so I hope the cross-arch impact is 'none', as intended.

(by Dave Hansen)

- Fix various NMI handling related bugs unearthed by the big asm code
rewrite and generally make the NMI code more robust and more
maintainable while at it. These changes are a bit late in the
cycle, I hope they are still acceptable.

(by Andy Lutomirski)"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and use it on x86
x86/fpu, sched: Dynamically allocate 'struct fpu'
x86/entry/64, x86/nmi/64: Add CONFIG_DEBUG_ENTRY NMI testing code
x86/nmi/64: Make the "NMI executing" variable more consistent
x86/nmi/64: Minor asm simplification
x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI detection
x86/nmi/64: Reorder nested NMI checks
x86/nmi/64: Improve nested NMI comments
x86/nmi/64: Switch stacks on userspace NMI entry
x86/nmi/64: Remove asm code that saves CR2
x86/nmi: Enable nested do_nmi() handling for 64-bit kernels

+379 -217
+4
arch/Kconfig
··· 221 221 config ARCH_THREAD_INFO_ALLOCATOR 222 222 bool 223 223 224 + # Select if arch wants to size task_struct dynamically via arch_task_struct_size: 225 + config ARCH_WANTS_DYNAMIC_TASK_STRUCT 226 + bool 227 + 224 228 config HAVE_REGS_AND_STACK_ACCESS_API 225 229 bool 226 230 help
+1
arch/x86/Kconfig
··· 41 41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 42 42 select ARCH_USE_QUEUED_RWLOCKS 43 43 select ARCH_USE_QUEUED_SPINLOCKS 44 + select ARCH_WANTS_DYNAMIC_TASK_STRUCT 44 45 select ARCH_WANT_FRAME_POINTERS 45 46 select ARCH_WANT_IPC_PARSE_VERSION if X86_32 46 47 select ARCH_WANT_OPTIONAL_GPIOLIB
+12
arch/x86/Kconfig.debug
··· 297 297 298 298 If unsure, say N. 299 299 300 + config DEBUG_ENTRY 301 + bool "Debug low-level entry code" 302 + depends on DEBUG_KERNEL 303 + ---help--- 304 + This option enables sanity checks in x86's low-level entry code. 305 + Some of these sanity checks may slow down kernel entries and 306 + exits or otherwise impact performance. 307 + 308 + This is currently used to help test NMI code. 309 + 310 + If unsure, say N. 311 + 300 312 config DEBUG_NMI_SELFTEST 301 313 bool "NMI Selftest" 302 314 depends on DEBUG_KERNEL && X86_LOCAL_APIC
+202 -103
arch/x86/entry/entry_64.S
··· 1237 1237 * If the variable is not set and the stack is not the NMI 1238 1238 * stack then: 1239 1239 * o Set the special variable on the stack 1240 - * o Copy the interrupt frame into a "saved" location on the stack 1241 - * o Copy the interrupt frame into a "copy" location on the stack 1240 + * o Copy the interrupt frame into an "outermost" location on the 1241 + * stack 1242 + * o Copy the interrupt frame into an "iret" location on the stack 1242 1243 * o Continue processing the NMI 1243 1244 * If the variable is set or the previous stack is the NMI stack: 1244 - * o Modify the "copy" location to jump to the repeate_nmi 1245 + * o Modify the "iret" location to jump to the repeat_nmi 1245 1246 * o return back to the first NMI 1246 1247 * 1247 1248 * Now on exit of the first NMI, we first clear the stack variable ··· 1251 1250 * a nested NMI that updated the copy interrupt stack frame, a 1252 1251 * jump will be made to the repeat_nmi code that will handle the second 1253 1252 * NMI. 1253 + * 1254 + * However, espfix prevents us from directly returning to userspace 1255 + * with a single IRET instruction. Similarly, IRET to user mode 1256 + * can fault. We therefore handle NMIs from user space like 1257 + * other IST entries. 1254 1258 */ 1255 1259 1256 1260 /* Use %rdx as our temp variable throughout */ 1257 1261 pushq %rdx 1258 1262 1259 - /* 1260 - * If %cs was not the kernel segment, then the NMI triggered in user 1261 - * space, which means it is definitely not nested. 1262 - */ 1263 - cmpl $__KERNEL_CS, 16(%rsp) 1264 - jne first_nmi 1263 + testb $3, CS-RIP+8(%rsp) 1264 + jz .Lnmi_from_kernel 1265 1265 1266 1266 /* 1267 - * Check the special variable on the stack to see if NMIs are 1268 - * executing. 1267 + * NMI from user mode. 
We need to run on the thread stack, but we 1268 + * can't go through the normal entry paths: NMIs are masked, and 1269 + * we don't want to enable interrupts, because then we'll end 1270 + * up in an awkward situation in which IRQs are on but NMIs 1271 + * are off. 1272 + */ 1273 + 1274 + SWAPGS 1275 + cld 1276 + movq %rsp, %rdx 1277 + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp 1278 + pushq 5*8(%rdx) /* pt_regs->ss */ 1279 + pushq 4*8(%rdx) /* pt_regs->rsp */ 1280 + pushq 3*8(%rdx) /* pt_regs->flags */ 1281 + pushq 2*8(%rdx) /* pt_regs->cs */ 1282 + pushq 1*8(%rdx) /* pt_regs->rip */ 1283 + pushq $-1 /* pt_regs->orig_ax */ 1284 + pushq %rdi /* pt_regs->di */ 1285 + pushq %rsi /* pt_regs->si */ 1286 + pushq (%rdx) /* pt_regs->dx */ 1287 + pushq %rcx /* pt_regs->cx */ 1288 + pushq %rax /* pt_regs->ax */ 1289 + pushq %r8 /* pt_regs->r8 */ 1290 + pushq %r9 /* pt_regs->r9 */ 1291 + pushq %r10 /* pt_regs->r10 */ 1292 + pushq %r11 /* pt_regs->r11 */ 1293 + pushq %rbx /* pt_regs->rbx */ 1294 + pushq %rbp /* pt_regs->rbp */ 1295 + pushq %r12 /* pt_regs->r12 */ 1296 + pushq %r13 /* pt_regs->r13 */ 1297 + pushq %r14 /* pt_regs->r14 */ 1298 + pushq %r15 /* pt_regs->r15 */ 1299 + 1300 + /* 1301 + * At this point we no longer need to worry about stack damage 1302 + * due to nesting -- we're on the normal thread stack and we're 1303 + * done with the NMI stack. 1304 + */ 1305 + 1306 + movq %rsp, %rdi 1307 + movq $-1, %rsi 1308 + call do_nmi 1309 + 1310 + /* 1311 + * Return back to user mode. We must *not* do the normal exit 1312 + * work, because we don't want to enable interrupts. Fortunately, 1313 + * do_nmi doesn't modify pt_regs. 
1314 + */ 1315 + SWAPGS 1316 + jmp restore_c_regs_and_iret 1317 + 1318 + .Lnmi_from_kernel: 1319 + /* 1320 + * Here's what our stack frame will look like: 1321 + * +---------------------------------------------------------+ 1322 + * | original SS | 1323 + * | original Return RSP | 1324 + * | original RFLAGS | 1325 + * | original CS | 1326 + * | original RIP | 1327 + * +---------------------------------------------------------+ 1328 + * | temp storage for rdx | 1329 + * +---------------------------------------------------------+ 1330 + * | "NMI executing" variable | 1331 + * +---------------------------------------------------------+ 1332 + * | iret SS } Copied from "outermost" frame | 1333 + * | iret Return RSP } on each loop iteration; overwritten | 1334 + * | iret RFLAGS } by a nested NMI to force another | 1335 + * | iret CS } iteration if needed. | 1336 + * | iret RIP } | 1337 + * +---------------------------------------------------------+ 1338 + * | outermost SS } initialized in first_nmi; | 1339 + * | outermost Return RSP } will not be changed before | 1340 + * | outermost RFLAGS } NMI processing is done. | 1341 + * | outermost CS } Copied to "iret" frame on each | 1342 + * | outermost RIP } iteration. | 1343 + * +---------------------------------------------------------+ 1344 + * | pt_regs | 1345 + * +---------------------------------------------------------+ 1346 + * 1347 + * The "original" frame is used by hardware. Before re-enabling 1348 + * NMIs, we need to be done with it, and we need to leave enough 1349 + * space for the asm code here. 1350 + * 1351 + * We return by executing IRET while RSP points to the "iret" frame. 1352 + * That will either return for real or it will loop back into NMI 1353 + * processing. 1354 + * 1355 + * The "outermost" frame is copied to the "iret" frame on each 1356 + * iteration of the loop, so each iteration starts with the "iret" 1357 + * frame pointing to the final return target. 
1358 + */ 1359 + 1360 + /* 1361 + * Determine whether we're a nested NMI. 1362 + * 1363 + * If we interrupted kernel code between repeat_nmi and 1364 + * end_repeat_nmi, then we are a nested NMI. We must not 1365 + * modify the "iret" frame because it's being written by 1366 + * the outer NMI. That's okay; the outer NMI handler is 1367 + * about to call do_nmi anyway, so we can just 1368 + * resume the outer NMI. 1369 + */ 1370 + 1371 + movq $repeat_nmi, %rdx 1372 + cmpq 8(%rsp), %rdx 1373 + ja 1f 1374 + movq $end_repeat_nmi, %rdx 1375 + cmpq 8(%rsp), %rdx 1376 + ja nested_nmi_out 1377 + 1: 1378 + 1379 + /* 1380 + * Now check "NMI executing". If it's set, then we're nested. 1381 + * This will not detect if we interrupted an outer NMI just 1382 + * before IRET. 1269 1383 */ 1270 1384 cmpl $1, -8(%rsp) 1271 1385 je nested_nmi 1272 1386 1273 1387 /* 1274 - * Now test if the previous stack was an NMI stack. 1275 - * We need the double check. We check the NMI stack to satisfy the 1276 - * race when the first NMI clears the variable before returning. 1277 - * We check the variable because the first NMI could be in a 1278 - * breakpoint routine using a breakpoint stack. 1388 + * Now test if the previous stack was an NMI stack. This covers 1389 + * the case where we interrupt an outer NMI after it clears 1390 + * "NMI executing" but before IRET. We need to be careful, though: 1391 + * there is one case in which RSP could point to the NMI stack 1392 + * despite there being no NMI active: naughty userspace controls 1393 + * RSP at the very beginning of the SYSCALL targets. We can 1394 + * pull a fast one on naughty userspace, though: we program 1395 + * SYSCALL to mask DF, so userspace cannot cause DF to be set 1396 + * if it controls the kernel's RSP. We set DF before we clear 1397 + * "NMI executing". 
1279 1398 */ 1280 1399 lea 6*8(%rsp), %rdx 1281 1400 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ ··· 1407 1286 cmpq %rdx, 4*8(%rsp) 1408 1287 /* If it is below the NMI stack, it is a normal NMI */ 1409 1288 jb first_nmi 1410 - /* Ah, it is within the NMI stack, treat it as nested */ 1289 + 1290 + /* Ah, it is within the NMI stack. */ 1291 + 1292 + testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp) 1293 + jz first_nmi /* RSP was user controlled. */ 1294 + 1295 + /* This is a nested NMI. */ 1411 1296 1412 1297 nested_nmi: 1413 1298 /* 1414 - * Do nothing if we interrupted the fixup in repeat_nmi. 1415 - * It's about to repeat the NMI handler, so we are fine 1416 - * with ignoring this one. 1299 + * Modify the "iret" frame to point to repeat_nmi, forcing another 1300 + * iteration of NMI handling. 1417 1301 */ 1418 - movq $repeat_nmi, %rdx 1419 - cmpq 8(%rsp), %rdx 1420 - ja 1f 1421 - movq $end_repeat_nmi, %rdx 1422 - cmpq 8(%rsp), %rdx 1423 - ja nested_nmi_out 1424 - 1425 - 1: 1426 - /* Set up the interrupted NMIs stack to jump to repeat_nmi */ 1427 - leaq -1*8(%rsp), %rdx 1428 - movq %rdx, %rsp 1302 + subq $8, %rsp 1429 1303 leaq -10*8(%rsp), %rdx 1430 1304 pushq $__KERNEL_DS 1431 1305 pushq %rdx ··· 1434 1318 nested_nmi_out: 1435 1319 popq %rdx 1436 1320 1437 - /* No need to check faults here */ 1321 + /* We are returning to kernel mode, so this cannot result in a fault. */ 1438 1322 INTERRUPT_RETURN 1439 1323 1440 1324 first_nmi: 1441 - /* 1442 - * Because nested NMIs will use the pushed location that we 1443 - * stored in rdx, we must keep that space available. 
1444 - * Here's what our stack frame will look like: 1445 - * +-------------------------+ 1446 - * | original SS | 1447 - * | original Return RSP | 1448 - * | original RFLAGS | 1449 - * | original CS | 1450 - * | original RIP | 1451 - * +-------------------------+ 1452 - * | temp storage for rdx | 1453 - * +-------------------------+ 1454 - * | NMI executing variable | 1455 - * +-------------------------+ 1456 - * | copied SS | 1457 - * | copied Return RSP | 1458 - * | copied RFLAGS | 1459 - * | copied CS | 1460 - * | copied RIP | 1461 - * +-------------------------+ 1462 - * | Saved SS | 1463 - * | Saved Return RSP | 1464 - * | Saved RFLAGS | 1465 - * | Saved CS | 1466 - * | Saved RIP | 1467 - * +-------------------------+ 1468 - * | pt_regs | 1469 - * +-------------------------+ 1470 - * 1471 - * The saved stack frame is used to fix up the copied stack frame 1472 - * that a nested NMI may change to make the interrupted NMI iret jump 1473 - * to the repeat_nmi. The original stack frame and the temp storage 1474 - * is also used by nested NMIs and can not be trusted on exit. 1475 - */ 1476 - /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ 1325 + /* Restore rdx. */ 1477 1326 movq (%rsp), %rdx 1478 1327 1479 - /* Set the NMI executing variable on the stack. */ 1480 - pushq $1 1328 + /* Make room for "NMI executing". */ 1329 + pushq $0 1481 1330 1482 - /* Leave room for the "copied" frame */ 1331 + /* Leave room for the "iret" frame */ 1483 1332 subq $(5*8), %rsp 1484 1333 1485 - /* Copy the stack frame to the Saved frame */ 1334 + /* Copy the "original" frame to the "outermost" frame */ 1486 1335 .rept 5 1487 1336 pushq 11*8(%rsp) 1488 1337 .endr 1489 1338 1490 1339 /* Everything up to here is safe from nested NMIs */ 1491 1340 1341 + #ifdef CONFIG_DEBUG_ENTRY 1342 + /* 1343 + * For ease of testing, unmask NMIs right away. Disabled by 1344 + * default because IRET is very expensive. 
1345 + */ 1346 + pushq $0 /* SS */ 1347 + pushq %rsp /* RSP (minus 8 because of the previous push) */ 1348 + addq $8, (%rsp) /* Fix up RSP */ 1349 + pushfq /* RFLAGS */ 1350 + pushq $__KERNEL_CS /* CS */ 1351 + pushq $1f /* RIP */ 1352 + INTERRUPT_RETURN /* continues at repeat_nmi below */ 1353 + 1: 1354 + #endif 1355 + 1356 + repeat_nmi: 1492 1357 /* 1493 1358 * If there was a nested NMI, the first NMI's iret will return 1494 1359 * here. But NMIs are still enabled and we can take another ··· 1478 1381 * it will just return, as we are about to repeat an NMI anyway. 1479 1382 * This makes it safe to copy to the stack frame that a nested 1480 1383 * NMI will update. 1384 + * 1385 + * RSP is pointing to "outermost RIP". gsbase is unknown, but, if 1386 + * we're repeating an NMI, gsbase has the same value that it had on 1387 + * the first iteration. paranoid_entry will load the kernel 1388 + * gsbase if needed before we call do_nmi. "NMI executing" 1389 + * is zero. 1481 1390 */ 1482 - repeat_nmi: 1483 - /* 1484 - * Update the stack variable to say we are still in NMI (the update 1485 - * is benign for the non-repeat case, where 1 was pushed just above 1486 - * to this very stack slot). 1487 - */ 1488 - movq $1, 10*8(%rsp) 1391 + movq $1, 10*8(%rsp) /* Set "NMI executing". */ 1489 1392 1490 - /* Make another copy, this one may be modified by nested NMIs */ 1393 + /* 1394 + * Copy the "outermost" frame to the "iret" frame. NMIs that nest 1395 + * here must not modify the "iret" frame while we're writing to 1396 + * it or it will end up containing garbage. 1397 + */ 1491 1398 addq $(10*8), %rsp 1492 1399 .rept 5 1493 1400 pushq -6*8(%rsp) ··· 1500 1399 end_repeat_nmi: 1501 1400 1502 1401 /* 1503 - * Everything below this point can be preempted by a nested 1504 - * NMI if the first NMI took an exception and reset our iret stack 1505 - * so that we repeat another NMI. 1402 + * Everything below this point can be preempted by a nested NMI. 
1403 + * If this happens, then the inner NMI will change the "iret" 1404 + * frame to point back to repeat_nmi. 1506 1405 */ 1507 1406 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1508 1407 ALLOC_PT_GPREGS_ON_STACK ··· 1516 1415 */ 1517 1416 call paranoid_entry 1518 1417 1519 - /* 1520 - * Save off the CR2 register. If we take a page fault in the NMI then 1521 - * it could corrupt the CR2 value. If the NMI preempts a page fault 1522 - * handler before it was able to read the CR2 register, and then the 1523 - * NMI itself takes a page fault, the page fault that was preempted 1524 - * will read the information from the NMI page fault and not the 1525 - * origin fault. Save it off and restore it if it changes. 1526 - * Use the r12 callee-saved register. 1527 - */ 1528 - movq %cr2, %r12 1529 - 1530 1418 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1531 1419 movq %rsp, %rdi 1532 1420 movq $-1, %rsi 1533 1421 call do_nmi 1534 1422 1535 - /* Did the NMI take a page fault? Restore cr2 if it did */ 1536 - movq %cr2, %rcx 1537 - cmpq %rcx, %r12 1538 - je 1f 1539 - movq %r12, %cr2 1540 - 1: 1541 1423 testl %ebx, %ebx /* swapgs needed? */ 1542 1424 jnz nmi_restore 1543 1425 nmi_swapgs: ··· 1528 1444 nmi_restore: 1529 1445 RESTORE_EXTRA_REGS 1530 1446 RESTORE_C_REGS 1531 - /* Pop the extra iret frame at once */ 1447 + 1448 + /* Point RSP at the "iret" frame. */ 1532 1449 REMOVE_PT_GPREGS_FROM_STACK 6*8 1533 1450 1534 - /* Clear the NMI executing stack variable */ 1535 - movq $0, 5*8(%rsp) 1451 + /* 1452 + * Clear "NMI executing". Set DF first so that we can easily 1453 + * distinguish the remaining code between here and IRET from 1454 + * the SYSCALL entry and exit paths. On a native kernel, we 1455 + * could just inspect RIP, but, on paravirt kernels, 1456 + * INTERRUPT_RETURN can translate into a jump into a 1457 + * hypercall page. 
1458 + */ 1459 + std 1460 + movq $0, 5*8(%rsp) /* clear "NMI executing" */ 1461 + 1462 + /* 1463 + * INTERRUPT_RETURN reads the "iret" frame and exits the NMI 1464 + * stack in a single instruction. We are returning to kernel 1465 + * mode, so this cannot result in a fault. 1466 + */ 1536 1467 INTERRUPT_RETURN 1537 1468 END(nmi) 1538 1469
+38 -34
arch/x86/include/asm/fpu/types.h
··· 189 189 struct fxregs_state fxsave; 190 190 struct swregs_state soft; 191 191 struct xregs_state xsave; 192 + u8 __padding[PAGE_SIZE]; 192 193 }; 193 194 194 195 /* ··· 198 197 * state fields: 199 198 */ 200 199 struct fpu { 201 - /* 202 - * @state: 203 - * 204 - * In-memory copy of all FPU registers that we save/restore 205 - * over context switches. If the task is using the FPU then 206 - * the registers in the FPU are more recent than this state 207 - * copy. If the task context-switches away then they get 208 - * saved here and represent the FPU state. 209 - * 210 - * After context switches there may be a (short) time period 211 - * during which the in-FPU hardware registers are unchanged 212 - * and still perfectly match this state, if the tasks 213 - * scheduled afterwards are not using the FPU. 214 - * 215 - * This is the 'lazy restore' window of optimization, which 216 - * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. 217 - * 218 - * We detect whether a subsequent task uses the FPU via setting 219 - * CR0::TS to 1, which causes any FPU use to raise a #NM fault. 220 - * 221 - * During this window, if the task gets scheduled again, we 222 - * might be able to skip having to do a restore from this 223 - * memory buffer to the hardware registers - at the cost of 224 - * incurring the overhead of #NM fault traps. 225 - * 226 - * Note that on modern CPUs that support the XSAVEOPT (or other 227 - * optimized XSAVE instructions), we don't use #NM traps anymore, 228 - * as the hardware can track whether FPU registers need saving 229 - * or not. On such CPUs we activate the non-lazy ('eagerfpu') 230 - * logic, which unconditionally saves/restores all FPU state 231 - * across context switches. (if FPU state exists.) 
232 - */ 233 - union fpregs_state state; 234 - 235 200 /* 236 201 * @last_cpu: 237 202 * ··· 255 288 * deal with bursty apps that only use the FPU for a short time: 256 289 */ 257 290 unsigned char counter; 291 + /* 292 + * @state: 293 + * 294 + * In-memory copy of all FPU registers that we save/restore 295 + * over context switches. If the task is using the FPU then 296 + * the registers in the FPU are more recent than this state 297 + * copy. If the task context-switches away then they get 298 + * saved here and represent the FPU state. 299 + * 300 + * After context switches there may be a (short) time period 301 + * during which the in-FPU hardware registers are unchanged 302 + * and still perfectly match this state, if the tasks 303 + * scheduled afterwards are not using the FPU. 304 + * 305 + * This is the 'lazy restore' window of optimization, which 306 + * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. 307 + * 308 + * We detect whether a subsequent task uses the FPU via setting 309 + * CR0::TS to 1, which causes any FPU use to raise a #NM fault. 310 + * 311 + * During this window, if the task gets scheduled again, we 312 + * might be able to skip having to do a restore from this 313 + * memory buffer to the hardware registers - at the cost of 314 + * incurring the overhead of #NM fault traps. 315 + * 316 + * Note that on modern CPUs that support the XSAVEOPT (or other 317 + * optimized XSAVE instructions), we don't use #NM traps anymore, 318 + * as the hardware can track whether FPU registers need saving 319 + * or not. On such CPUs we activate the non-lazy ('eagerfpu') 320 + * logic, which unconditionally saves/restores all FPU state 321 + * across context switches. (if FPU state exists.) 322 + */ 323 + union fpregs_state state; 324 + /* 325 + * WARNING: 'state' is dynamically-sized. Do not put 326 + * anything after it here. 327 + */ 258 328 }; 259 329 260 330 #endif /* _ASM_X86_FPU_H */
+7 -3
arch/x86/include/asm/processor.h
··· 390 390 #endif 391 391 unsigned long gs; 392 392 393 - /* Floating point and extended processor state */ 394 - struct fpu fpu; 395 - 396 393 /* Save middle states of ptrace breakpoints */ 397 394 struct perf_event *ptrace_bps[HBP_NUM]; 398 395 /* Debug status used for traps, single steps, etc... */ ··· 415 418 unsigned long iopl; 416 419 /* Max allowed port in the bitmap, in bytes: */ 417 420 unsigned io_bitmap_max; 421 + 422 + /* Floating point and extended processor state */ 423 + struct fpu fpu; 424 + /* 425 + * WARNING: 'fpu' is dynamically-sized. It *MUST* be at 426 + * the end. 427 + */ 418 428 }; 419 429 420 430 /*
+40
arch/x86/kernel/fpu/init.c
··· 4 4 #include <asm/fpu/internal.h> 5 5 #include <asm/tlbflush.h> 6 6 7 + #include <linux/sched.h> 8 + 7 9 /* 8 10 * Initialize the TS bit in CR0 according to the style of context-switches 9 11 * we are using: ··· 137 135 */ 138 136 unsigned int xstate_size; 139 137 EXPORT_SYMBOL_GPL(xstate_size); 138 + 139 + /* Enforce that 'MEMBER' is the last field of 'TYPE': */ 140 + #define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ 141 + BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) 142 + 143 + /* 144 + * We append the 'struct fpu' to the task_struct: 145 + */ 146 + static void __init fpu__init_task_struct_size(void) 147 + { 148 + int task_size = sizeof(struct task_struct); 149 + 150 + /* 151 + * Subtract off the static size of the register state. 152 + * It potentially has a bunch of padding. 153 + */ 154 + task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); 155 + 156 + /* 157 + * Add back the dynamically-calculated register state 158 + * size. 159 + */ 160 + task_size += xstate_size; 161 + 162 + /* 163 + * We dynamically size 'struct fpu', so we require that 164 + * it be at the end of 'thread_struct' and that 165 + * 'thread_struct' be at the end of 'task_struct'. If 166 + * you hit a compile error here, check the structure to 167 + * see if something got added to the end. 168 + */ 169 + CHECK_MEMBER_AT_END_OF(struct fpu, state); 170 + CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); 171 + CHECK_MEMBER_AT_END_OF(struct task_struct, thread); 172 + 173 + arch_task_struct_size = task_size; 174 + } 140 175 141 176 /* 142 177 * Set up the xstate_size based on the legacy FPU context size. ··· 326 287 fpu__init_system_generic(); 327 288 fpu__init_system_xstate_size_legacy(); 328 289 fpu__init_system_xstate(); 290 + fpu__init_task_struct_size(); 329 291 330 292 fpu__init_system_ctx_switch(); 331 293 }
+52 -71
arch/x86/kernel/nmi.c
··· 408 408 NOKPROBE_SYMBOL(default_do_nmi); 409 409 410 410 /* 411 - * NMIs can hit breakpoints which will cause it to lose its 412 - * NMI context with the CPU when the breakpoint does an iret. 413 - */ 414 - #ifdef CONFIG_X86_32 415 - /* 416 - * For i386, NMIs use the same stack as the kernel, and we can 417 - * add a workaround to the iret problem in C (preventing nested 418 - * NMIs if an NMI takes a trap). Simply have 3 states the NMI 419 - * can be in: 411 + * NMIs can page fault or hit breakpoints which will cause it to lose 412 + * its NMI context with the CPU when the breakpoint or page fault does an IRET. 413 + * 414 + * As a result, NMIs can nest if NMIs get unmasked due to an IRET during 415 + * NMI processing. On x86_64, the asm glue protects us from nested NMIs 416 + * if the outer NMI came from kernel mode, but we can still nest if the 417 + * outer NMI came from user mode. 418 + * 419 + * To handle these nested NMIs, we have three states: 420 420 * 421 421 * 1) not running 422 422 * 2) executing ··· 430 430 * (Note, the latch is binary, thus multiple NMIs triggering, 431 431 * when one is running, are ignored. Only one NMI is restarted.) 432 432 * 433 - * If an NMI hits a breakpoint that executes an iret, another 434 - * NMI can preempt it. We do not want to allow this new NMI 435 - * to run, but we want to execute it when the first one finishes. 436 - * We set the state to "latched", and the exit of the first NMI will 437 - * perform a dec_return, if the result is zero (NOT_RUNNING), then 438 - * it will simply exit the NMI handler. If not, the dec_return 439 - * would have set the state to NMI_EXECUTING (what we want it to 440 - * be when we are running). In this case, we simply jump back 441 - * to rerun the NMI handler again, and restart the 'latched' NMI. 433 + * If an NMI executes an iret, another NMI can preempt it. We do not 434 + * want to allow this new NMI to run, but we want to execute it when the 435 + * first one finishes. 
We set the state to "latched", and the exit of 436 + * the first NMI will perform a dec_return, if the result is zero 437 + * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the 438 + * dec_return would have set the state to NMI_EXECUTING (what we want it 439 + * to be when we are running). In this case, we simply jump back to 440 + * rerun the NMI handler again, and restart the 'latched' NMI. 442 441 * 443 442 * No trap (breakpoint or page fault) should be hit before nmi_restart, 444 443 * thus there is no race between the first check of state for NOT_RUNNING ··· 460 461 static DEFINE_PER_CPU(enum nmi_states, nmi_state); 461 462 static DEFINE_PER_CPU(unsigned long, nmi_cr2); 462 463 463 - #define nmi_nesting_preprocess(regs) \ 464 - do { \ 465 - if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ 466 - this_cpu_write(nmi_state, NMI_LATCHED); \ 467 - return; \ 468 - } \ 469 - this_cpu_write(nmi_state, NMI_EXECUTING); \ 470 - this_cpu_write(nmi_cr2, read_cr2()); \ 471 - } while (0); \ 472 - nmi_restart: 473 - 474 - #define nmi_nesting_postprocess() \ 475 - do { \ 476 - if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ 477 - write_cr2(this_cpu_read(nmi_cr2)); \ 478 - if (this_cpu_dec_return(nmi_state)) \ 479 - goto nmi_restart; \ 480 - } while (0) 481 - #else /* x86_64 */ 464 + #ifdef CONFIG_X86_64 482 465 /* 483 - * In x86_64 things are a bit more difficult. This has the same problem 484 - * where an NMI hitting a breakpoint that calls iret will remove the 485 - * NMI context, allowing a nested NMI to enter. What makes this more 486 - * difficult is that both NMIs and breakpoints have their own stack. 487 - * When a new NMI or breakpoint is executed, the stack is set to a fixed 488 - * point. If an NMI is nested, it will have its stack set at that same 489 - * fixed address that the first NMI had, and will start corrupting the 490 - * stack. This is handled in entry_64.S, but the same problem exists with 491 - * the breakpoint stack. 
466 + * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without 467 + * some care, the inner breakpoint will clobber the outer breakpoint's 468 + * stack. 492 469 * 493 - * If a breakpoint is being processed, and the debug stack is being used, 494 - * if an NMI comes in and also hits a breakpoint, the stack pointer 495 - * will be set to the same fixed address as the breakpoint that was 496 - * interrupted, causing that stack to be corrupted. To handle this case, 497 - * check if the stack that was interrupted is the debug stack, and if 498 - * so, change the IDT so that new breakpoints will use the current stack 499 - * and not switch to the fixed address. On return of the NMI, switch back 500 - * to the original IDT. 470 + * If a breakpoint is being processed, and the debug stack is being 471 + * used, if an NMI comes in and also hits a breakpoint, the stack 472 + * pointer will be set to the same fixed address as the breakpoint that 473 + * was interrupted, causing that stack to be corrupted. To handle this 474 + * case, check if the stack that was interrupted is the debug stack, and 475 + * if so, change the IDT so that new breakpoints will use the current 476 + * stack and not switch to the fixed address. On return of the NMI, 477 + * switch back to the original IDT. 501 478 */ 502 479 static DEFINE_PER_CPU(int, update_debug_stack); 480 + #endif 503 481 504 - static inline void nmi_nesting_preprocess(struct pt_regs *regs) 482 + dotraplinkage notrace void 483 + do_nmi(struct pt_regs *regs, long error_code) 505 484 { 485 + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { 486 + this_cpu_write(nmi_state, NMI_LATCHED); 487 + return; 488 + } 489 + this_cpu_write(nmi_state, NMI_EXECUTING); 490 + this_cpu_write(nmi_cr2, read_cr2()); 491 + nmi_restart: 492 + 493 + #ifdef CONFIG_X86_64 506 494 /* 507 495 * If we interrupted a breakpoint, it is possible that 508 496 * the nmi handler will have breakpoints too. 
We need to ··· 500 514 debug_stack_set_zero(); 501 515 this_cpu_write(update_debug_stack, 1); 502 516 } 503 - } 504 - 505 - static inline void nmi_nesting_postprocess(void) 506 - { 507 - if (unlikely(this_cpu_read(update_debug_stack))) { 508 - debug_stack_reset(); 509 - this_cpu_write(update_debug_stack, 0); 510 - } 511 - } 512 517 #endif 513 - 514 - dotraplinkage notrace void 515 - do_nmi(struct pt_regs *regs, long error_code) 516 - { 517 - nmi_nesting_preprocess(regs); 518 518 519 519 nmi_enter(); 520 520 ··· 511 539 512 540 nmi_exit(); 513 541 514 - /* On i386, may loop back to preprocess */ 515 - nmi_nesting_postprocess(); 542 + #ifdef CONFIG_X86_64 543 + if (unlikely(this_cpu_read(update_debug_stack))) { 544 + debug_stack_reset(); 545 + this_cpu_write(update_debug_stack, 0); 546 + } 547 + #endif 548 + 549 + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) 550 + write_cr2(this_cpu_read(nmi_cr2)); 551 + if (this_cpu_dec_return(nmi_state)) 552 + goto nmi_restart; 516 553 } 517 554 NOKPROBE_SYMBOL(do_nmi); 518 555
+1 -1
arch/x86/kernel/process.c
··· 81 81 */ 82 82 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 83 83 { 84 - *dst = *src; 84 + memcpy(dst, src, arch_task_struct_size); 85 85 86 86 return fpu__copy(&dst->thread.fpu, &src->thread.fpu); 87 87 }
+2 -2
fs/proc/kcore.c
··· 92 92 roundup(sizeof(CORE_STR), 4)) + 93 93 roundup(sizeof(struct elf_prstatus), 4) + 94 94 roundup(sizeof(struct elf_prpsinfo), 4) + 95 - roundup(sizeof(struct task_struct), 4); 95 + roundup(arch_task_struct_size, 4); 96 96 *elf_buflen = PAGE_ALIGN(*elf_buflen); 97 97 return size + *elf_buflen; 98 98 } ··· 415 415 /* set up the task structure */ 416 416 notes[2].name = CORE_STR; 417 417 notes[2].type = NT_TASKSTRUCT; 418 - notes[2].datasz = sizeof(struct task_struct); 418 + notes[2].datasz = arch_task_struct_size; 419 419 notes[2].data = current; 420 420 421 421 nhdr->p_filesz += notesize(&notes[2]);
+14 -2
include/linux/sched.h
··· 1522 1522 /* hung task detection */ 1523 1523 unsigned long last_switch_count; 1524 1524 #endif 1525 - /* CPU-specific state of this task */ 1526 - struct thread_struct thread; 1527 1525 /* filesystem information */ 1528 1526 struct fs_struct *fs; 1529 1527 /* open file information */ ··· 1776 1778 unsigned long task_state_change; 1777 1779 #endif 1778 1780 int pagefault_disabled; 1781 + /* CPU-specific state of this task */ 1782 + struct thread_struct thread; 1783 + /* 1784 + * WARNING: on x86, 'thread_struct' contains a variable-sized 1785 + * structure. It *MUST* be at the end of 'task_struct'. 1786 + * 1787 + * Do not put anything below here! 1788 + */ 1779 1789 }; 1790 + 1791 + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT 1792 + extern int arch_task_struct_size __read_mostly; 1793 + #else 1794 + # define arch_task_struct_size (sizeof(struct task_struct)) 1795 + #endif 1780 1796 1781 1797 /* Future-safe accessor for struct task_struct's cpus_allowed. */ 1782 1798 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+6 -1
kernel/fork.c
··· 287 287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); 288 288 } 289 289 290 + #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT 291 + /* Initialized by the architecture: */ 292 + int arch_task_struct_size __read_mostly; 293 + #endif 294 + 290 295 void __init fork_init(void) 291 296 { 292 297 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR ··· 300 295 #endif 301 296 /* create a slab on which task_structs can be allocated */ 302 297 task_struct_cachep = 303 - kmem_cache_create("task_struct", sizeof(struct task_struct), 298 + kmem_cache_create("task_struct", arch_task_struct_size, 304 299 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); 305 300 #endif 306 301