Linux kernel mirror (for testing) - git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git


Merge branch 'stable/bug-fixes-for-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen

* 'stable/bug-fixes-for-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen: mask_rw_pte mark RO all pagetable pages up to pgt_buf_top
  xen/mmu: Add workaround "x86-64, mm: Put early page table high"

+124 -1
arch/x86/xen/mmu.c
···
 	return ret;
 }
 
+#ifdef CONFIG_X86_64
+static __initdata u64 __last_pgt_set_rw = 0;
+static __initdata u64 __pgt_buf_start = 0;
+static __initdata u64 __pgt_buf_end = 0;
+static __initdata u64 __pgt_buf_top = 0;
+/*
+ * As a consequence of the commit:
+ *
+ * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
+ * Author: Yinghai Lu <yinghai@kernel.org>
+ * Date:   Fri Dec 17 16:58:28 2010 -0800
+ *
+ *     x86-64, mm: Put early page table high
+ *
+ * at some point init_memory_mapping is going to reach the pagetable pages
+ * area and map those pages too (mapping them as normal memory that falls
+ * in the range of addresses passed to init_memory_mapping as argument).
+ * Some of those pages are already pagetable pages (they are in the range
+ * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
+ * everything is fine.
+ * Some of these pages are not pagetable pages yet (they fall in the range
+ * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
+ * are going to be mapped RW. When these pages become pagetable pages and
+ * are hooked into the pagetable, xen will find that the guest has already
+ * a RW mapping of them somewhere and fail the operation.
+ * The reason Xen requires pagetables to be RO is that the hypervisor needs
+ * to verify that the pagetables are valid before using them. The validation
+ * operations are called "pinning".
+ *
+ * In order to fix the issue we mark all the pages in the entire range
+ * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
+ * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
+ * init_memory_mapping. Hence the kernel is going to crash as soon as one
+ * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
+ * ranges are RO).
+ *
+ * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
+ * the init_memory_mapping has completed (in a perfect world we would
+ * call this function from init_memory_mapping, but let's ignore that).
+ *
+ * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
+ * end, top] have all changed to new values (b/c init_memory_mapping
+ * is called and setting up another new page-table). Hence, the first time
+ * we enter this function, we save away the pgt_buf_start value and update
+ * the pgt_buf_[end, top].
+ *
+ * When we detect that the "old" pgt_buf_start through pgt_buf_end
+ * PFNs have been reserved (so memblock_x86_reserve_range has been called),
+ * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
+ *
+ * And then we update those "old" pgt_buf_[end|top] with the new ones
+ * so that we can redo this on the next pagetable.
+ */
+static __init void mark_rw_past_pgt(void) {
+
+	if (pgt_buf_end > pgt_buf_start) {
+		u64 addr, size;
+
+		/* Save it away. */
+		if (!__pgt_buf_start) {
+			__pgt_buf_start = pgt_buf_start;
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+		/* If we get the range that starts at __pgt_buf_end that means
+		 * the range is reserved, and that in 'init_memory_mapping'
+		 * the 'memblock_x86_reserve_range' has been called with the
+		 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
+		 * pgt_buf_[start|end|top] refer now to a new pagetable).
+		 * Note: we are called _after_ the pgt_buf_[..] have been
+		 * updated. */
+
+		addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
+						       &size, PAGE_SIZE);
+
+		/* Still not reserved, meaning 'memblock_x86_reserve_range'
+		 * hasn't been called yet. Update the _end and _top. */
+		if (addr == PFN_PHYS(__pgt_buf_start)) {
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+
+		/* OK, the area is reserved, meaning it is time for us to
+		 * set RW for the old end->top PFNs. */
+
+		/* ..unless we had already done this. */
+		if (__pgt_buf_end == __last_pgt_set_rw)
+			return;
+
+		addr = PFN_PHYS(__pgt_buf_end);
+
+		/* set as RW the rest */
+		printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
+		       PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
+
+		while (addr < PFN_PHYS(__pgt_buf_top)) {
+			make_lowmem_page_readwrite(__va(addr));
+			addr += PAGE_SIZE;
+		}
+		/* And update everything so that we are ready for the next
+		 * pagetable (the one created for regions past 4GB). */
+		__last_pgt_set_rw = __pgt_buf_end;
+		__pgt_buf_start = pgt_buf_start;
+		__pgt_buf_end = pgt_buf_end;
+		__pgt_buf_top = pgt_buf_top;
+	}
+	return;
+}
+#else
+static __init void mark_rw_past_pgt(void) { }
+#endif
 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_X86_64
···
 	unsigned long pfn = pte_pfn(pte);
 
 	/*
+	 * A bit of optimization. We do not need to call the workaround
+	 * when xen_set_pte_init is called with a PTE with 0 as PFN.
+	 * That is b/c the pagetables at that point are just being populated
+	 * with empty values and we can save some cycles by not calling
+	 * the 'memblock' code. */
+	if (pfn)
+		mark_rw_past_pgt();
+	/*
 	 * If the new pfn is within the range of the newly allocated
 	 * kernel pagetable, and it isn't being mapped into an
 	 * early_ioremap fixmap slot as a freshly allocated page, make sure
 	 * it is RO.
 	 */
 	if (((!is_early_ioremap_ptep(ptep) &&
-			pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+			pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
 	    (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
 		pte = pte_wrprotect(pte);
···
 
 static __init void xen_post_allocator_init(void)
 {
+	mark_rw_past_pgt();
+
 #ifdef CONFIG_XEN_DEBUG
 	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
 #endif
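
To see the logic of the workaround in isolation, here is a minimal user-space sketch of the state machine mark_rw_past_pgt() implements. It is an illustration under stated assumptions, not kernel code: a boolean old_range_reserved stands in for the memblock_x86_find_in_range_size() reservation probe, make_page_readwrite() just prints, and the PFN values in main() are invented. Only the three-way decision mirrors the patch: the first call snapshots the pgt_buf_* values, calls before the old range is reserved track its growth, and once the old range is reserved the leftover RO pages in [old pgt_buf_end, old pgt_buf_top) are flipped back to RW exactly once.

/*
 * toy_mark_rw.c - stand-alone sketch of the bookkeeping above.
 * All names and values here are hypothetical, for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PFN_PHYS(pfn)	((uint64_t)(pfn) * PAGE_SIZE)

/* Stand-ins for the kernel's pgt_buf_* globals. */
static uint64_t pgt_buf_start, pgt_buf_end, pgt_buf_top;

/* Toy substitute for the memblock reservation probe:
 * has the old pagetable buffer been reserved yet? */
static bool old_range_reserved;

/* Mirrors the patch's __pgt_buf_* and __last_pgt_set_rw statics. */
static uint64_t saved_start, saved_end, saved_top, last_set_rw;

static void make_page_readwrite(uint64_t addr)
{
	printf("  set RW: page at %#llx\n", (unsigned long long)addr);
}

/* Same three-way decision as the patch's mark_rw_past_pgt(). */
static void mark_rw_past_pgt(void)
{
	if (pgt_buf_end <= pgt_buf_start)
		return;

	if (!saved_start) {		/* first call: just snapshot */
		saved_start = pgt_buf_start;
		saved_end = pgt_buf_end;
		saved_top = pgt_buf_top;
		return;
	}
	if (!old_range_reserved) {	/* not reserved yet: track growth */
		saved_end = pgt_buf_end;
		saved_top = pgt_buf_top;
		return;
	}
	if (saved_end == last_set_rw)	/* this pagetable already handled */
		return;

	/* Reserved: hand the leftover RO pages [saved_end, saved_top) back. */
	for (uint64_t a = PFN_PHYS(saved_end); a < PFN_PHYS(saved_top);
	     a += PAGE_SIZE)
		make_page_readwrite(a);

	last_set_rw = saved_end;	/* ready for the next pagetable */
	saved_start = pgt_buf_start;
	saved_end = pgt_buf_end;
	saved_top = pgt_buf_top;
}

int main(void)
{
	/* 1st pagetable: PFNs 100-102 in use, buffer (all RO) ends at 106. */
	pgt_buf_start = 100; pgt_buf_end = 103; pgt_buf_top = 106;
	mark_rw_past_pgt();		/* snapshot only, nothing printed */

	/* init_memory_mapping reserved the old range, started a new one. */
	old_range_reserved = true;
	pgt_buf_start = 200; pgt_buf_end = 201; pgt_buf_top = 210;
	mark_rw_past_pgt();		/* flips PFNs 103-105 back to RW */
	return 0;
}

Built with a plain `cc toy_mark_rw.c`, the second call prints one "set RW" line per leftover page (PFNs 103-105), the analogue of the kernel's printk of the end-to-top range in the patch.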