Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

timers/migration: Fix imbalanced NUMA trees

When a CPU from a new node boots, the old root may happen to be
connected to the new root even if their nodes mismatch, as depicted in
the following scenario:

1) CPU 0 boots and creates the first group for node 0.

    [GRP0:0]
     node 0
       |
     CPU 0

2) CPU 1 from node 1 boots and creates a new top that corresponds to
node 1, but it also connects the old root from node 0 to the new root
from node 1 by mistake.

            [GRP1:0]
             node 1
            /      \
           /        \
    [GRP0:0]        [GRP0:1]
     node 0          node 1
       |               |
     CPU 0           CPU 1

3) This eventually leads to an imbalanced tree where some node 0 CPUs
migrate node 1 timers (and vice versa) way before reaching the
crossnode groups, resulting in more frequent remote memory accesses
than expected.

                    [GRP2:0]
                  NUMA_NO_NODE
                 /            \
          [GRP1:0]            [GRP1:1]
           node 1              node 0
          /      \                |
         /        \             [...]
  [GRP0:0]        [GRP0:1]
   node 0          node 1
     |               |
  CPU 0...        CPU 1...

A balanced tree should only contain groups having children that belong
to the same node:

                      [GRP2:0]
                    NUMA_NO_NODE
                 /                \
          [GRP1:0]                [GRP1:1]
           node 0                  node 1
          /      \                /      \
         /        \              /        \
  [GRP0:0]       [...]        [...]       [GRP0:1]
   node 0                                  node 1
     |                                       |
  CPU 0...                                CPU 1...

In order to fix this, the hierarchy must be unfolded up to the crossnode
level as soon as a node mismatch is detected. For example, stage 2
above should lead to this layout:

                  [GRP2:0]
                NUMA_NO_NODE
               /            \
        [GRP1:0]            [GRP1:1]
         node 0              node 1
           /                    \
          /                      \
   [GRP0:0]                      [GRP0:1]
    node 0                        node 1
      |                             |
    CPU 0                         CPU 1

This means that not only GRP1:0 must be created but also GRP1:1 and
GRP2:0 in order to prepare a balanced tree for the next CPUs to boot.

Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-4-frederic@kernel.org

authored by

Frederic Weisbecker and committed by
Thomas Gleixner
5eb579df fa962035

+134 -111
+134 -111
kernel/time/timer_migration.c
··· 420 420 static unsigned int tmigr_hierarchy_levels __read_mostly; 421 421 static unsigned int tmigr_crossnode_level __read_mostly; 422 422 423 + static struct tmigr_group *tmigr_root; 424 + 423 425 static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); 424 426 425 427 #define TMIGR_NONE 0xFF ··· 524 522 525 523 typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); 526 524 527 - static void __walk_groups(up_f up, struct tmigr_walk *data, 528 - struct tmigr_cpu *tmc) 525 + static void __walk_groups_from(up_f up, struct tmigr_walk *data, 526 + struct tmigr_group *child, struct tmigr_group *group) 529 527 { 530 - struct tmigr_group *child = NULL, *group = tmc->tmgroup; 531 - 532 528 do { 533 529 WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); 534 530 ··· 542 542 data->childmask = child->groupmask; 543 543 WARN_ON_ONCE(!data->childmask); 544 544 } while (group); 545 + } 546 + 547 + static void __walk_groups(up_f up, struct tmigr_walk *data, 548 + struct tmigr_cpu *tmc) 549 + { 550 + __walk_groups_from(up, data, NULL, tmc->tmgroup); 545 551 } 546 552 547 553 static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) ··· 1504 1498 s.seq = 0; 1505 1499 atomic_set(&group->migr_state, s.state); 1506 1500 1507 - /* 1508 - * If this is a new top-level, prepare its groupmask in advance. 1509 - * This avoids accidents where yet another new top-level is 1510 - * created in the future and made visible before the current groupmask. 1511 - */ 1512 - if (list_empty(&tmigr_level_list[lvl])) { 1513 - group->groupmask = BIT(0); 1514 - /* 1515 - * The previous top level has prepared its groupmask already, 1516 - * simply account it as the first child. 
1517 - */ 1518 - if (lvl > 0) 1519 - group->num_children = 1; 1520 - } 1521 - 1522 1501 timerqueue_init_head(&group->events); 1523 1502 timerqueue_init(&group->groupevt.nextevt); 1524 1503 group->groupevt.nextevt.expires = KTIME_MAX; ··· 1558 1567 return group; 1559 1568 } 1560 1569 1570 + static bool tmigr_init_root(struct tmigr_group *group, bool activate) 1571 + { 1572 + if (!group->parent && group != tmigr_root) { 1573 + /* 1574 + * This is the new top-level, prepare its groupmask in advance 1575 + * to avoid accidents where yet another new top-level is 1576 + * created in the future and made visible before this groupmask. 1577 + */ 1578 + group->groupmask = BIT(0); 1579 + WARN_ON_ONCE(activate); 1580 + 1581 + return true; 1582 + } 1583 + 1584 + return false; 1585 + 1586 + } 1587 + 1561 1588 static void tmigr_connect_child_parent(struct tmigr_group *child, 1562 1589 struct tmigr_group *parent, 1563 1590 bool activate) 1564 1591 { 1565 - struct tmigr_walk data; 1566 - 1567 - if (activate) { 1592 + if (tmigr_init_root(parent, activate)) { 1568 1593 /* 1569 - * @child is the old top and @parent the new one. In this 1570 - * case groupmask is pre-initialized and @child already 1571 - * accounted, along with its new sibling corresponding to the 1572 - * CPU going up. 1594 + * The previous top level had prepared its groupmask already, 1595 + * simply account it in advance as the first child. If some groups 1596 + * have been created between the old and new root due to node 1597 + * mismatch, the new root's child will be intialized accordingly. 1573 1598 */ 1574 - WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); 1599 + parent->num_children = 1; 1600 + } 1601 + 1602 + /* Connecting old root to new root ? */ 1603 + if (!parent->parent && activate) { 1604 + /* 1605 + * @child is the old top, or in case of node mismatch, some 1606 + * intermediate group between the old top and the new one in 1607 + * @parent. 
In this case the @child must be pre-accounted above 1608 + * as the first child. Its new inactive sibling corresponding 1609 + * to the CPU going up has been accounted as the second child. 1610 + */ 1611 + WARN_ON_ONCE(parent->num_children != 2); 1612 + child->groupmask = BIT(0); 1575 1613 } else { 1576 - /* Adding @child for the CPU going up to @parent. */ 1614 + /* Common case adding @child for the CPU going up to @parent. */ 1577 1615 child->groupmask = BIT(parent->num_children++); 1578 1616 } 1579 1617 ··· 1614 1594 smp_store_release(&child->parent, parent); 1615 1595 1616 1596 trace_tmigr_connect_child_parent(child); 1617 - 1618 - if (!activate) 1619 - return; 1620 - 1621 - /* 1622 - * To prevent inconsistent states, active children need to be active in 1623 - * the new parent as well. Inactive children are already marked inactive 1624 - * in the parent group: 1625 - * 1626 - * * When new groups were created by tmigr_setup_groups() starting from 1627 - * the lowest level (and not higher then one level below the current 1628 - * top level), then they are not active. They will be set active when 1629 - * the new online CPU comes active. 1630 - * 1631 - * * But if a new group above the current top level is required, it is 1632 - * mandatory to propagate the active state of the already existing 1633 - * child to the new parent. So tmigr_connect_child_parent() is 1634 - * executed with the formerly top level group (child) and the newly 1635 - * created group (parent). 1636 - * 1637 - * * It is ensured that the child is active, as this setup path is 1638 - * executed in hotplug prepare callback. This is exectued by an 1639 - * already connected and !idle CPU. Even if all other CPUs go idle, 1640 - * the CPU executing the setup will be responsible up to current top 1641 - * level group. And the next time it goes inactive, it will release 1642 - * the new childmask and parent to subsequent walkers through this 1643 - * @child. 
Therefore propagate active state unconditionally. 1644 - */ 1645 - data.childmask = child->groupmask; 1646 - 1647 - /* 1648 - * There is only one new level per time (which is protected by 1649 - * tmigr_mutex). When connecting the child and the parent and set the 1650 - * child active when the parent is inactive, the parent needs to be the 1651 - * uppermost level. Otherwise there went something wrong! 1652 - */ 1653 - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); 1654 1597 } 1655 1598 1656 - static int tmigr_setup_groups(unsigned int cpu, unsigned int node) 1599 + static int tmigr_setup_groups(unsigned int cpu, unsigned int node, 1600 + struct tmigr_group *start, bool activate) 1657 1601 { 1658 1602 struct tmigr_group *group, *child, **stack; 1659 - int i, top = 0, err = 0; 1660 - struct list_head *lvllist; 1603 + int i, top = 0, err = 0, start_lvl = 0; 1604 + bool root_mismatch = false; 1661 1605 1662 1606 stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); 1663 1607 if (!stack) 1664 1608 return -ENOMEM; 1665 1609 1666 - for (i = 0; i < tmigr_hierarchy_levels; i++) { 1610 + if (start) { 1611 + stack[start->level] = start; 1612 + start_lvl = start->level + 1; 1613 + } 1614 + 1615 + if (tmigr_root) 1616 + root_mismatch = tmigr_root->numa_node != node; 1617 + 1618 + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { 1667 1619 group = tmigr_get_group(cpu, node, i); 1668 1620 if (IS_ERR(group)) { 1669 1621 err = PTR_ERR(group); ··· 1648 1656 1649 1657 /* 1650 1658 * When booting only less CPUs of a system than CPUs are 1651 - * available, not all calculated hierarchy levels are required. 1659 + * available, not all calculated hierarchy levels are required, 1660 + * unless a node mismatch is detected. 1652 1661 * 1653 1662 * The loop is aborted as soon as the highest level, which might 1654 1663 * be different from tmigr_hierarchy_levels, contains only a 1655 - * single group. 
1664 + * single group, unless the nodes mismatch below tmigr_crossnode_level 1656 1665 */ 1657 - if (group->parent || list_is_singular(&tmigr_level_list[i])) 1666 + if (group->parent) 1667 + break; 1668 + if ((!root_mismatch || i >= tmigr_crossnode_level) && 1669 + list_is_singular(&tmigr_level_list[i])) 1658 1670 break; 1659 1671 } 1660 1672 1661 1673 /* Assert single root without parent */ 1662 1674 if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) 1663 1675 return -EINVAL; 1664 - if (WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top]))) 1665 - return -EINVAL; 1666 1676 1667 - for (; i >= 0; i--) { 1677 + for (; i >= start_lvl; i--) { 1668 1678 group = stack[i]; 1669 1679 1670 1680 if (err < 0) { ··· 1686 1692 tmc->tmgroup = group; 1687 1693 tmc->groupmask = BIT(group->num_children++); 1688 1694 1695 + tmigr_init_root(group, activate); 1696 + 1689 1697 trace_tmigr_connect_cpu_parent(tmc); 1690 1698 1691 1699 /* There are no children that need to be connected */ 1692 1700 continue; 1693 1701 } else { 1694 1702 child = stack[i - 1]; 1695 - /* Will be activated at online time */ 1696 - tmigr_connect_child_parent(child, group, false); 1697 - } 1698 - 1699 - /* check if uppermost level was newly created */ 1700 - if (top != i) 1701 - continue; 1702 - 1703 - WARN_ON_ONCE(top == 0); 1704 - 1705 - lvllist = &tmigr_level_list[top]; 1706 - 1707 - /* 1708 - * Newly created root level should have accounted the upcoming 1709 - * CPU's child group and pre-accounted the old root. 1710 - */ 1711 - if (group->num_children == 2 && list_is_singular(lvllist)) { 1712 - /* 1713 - * The target CPU must never do the prepare work, except 1714 - * on early boot when the boot CPU is the target. Otherwise 1715 - * it may spuriously activate the old top level group inside 1716 - * the new one (nevertheless whether old top level group is 1717 - * active or not) and/or release an uninitialized childmask. 
1718 - */ 1719 - WARN_ON_ONCE(cpu == raw_smp_processor_id()); 1720 - 1721 - lvllist = &tmigr_level_list[top - 1]; 1722 - list_for_each_entry(child, lvllist, list) { 1723 - if (child->parent) 1724 - continue; 1725 - 1726 - tmigr_connect_child_parent(child, group, true); 1727 - } 1703 + tmigr_connect_child_parent(child, group, activate); 1728 1704 } 1729 1705 } 1730 1706 1707 + if (err < 0) 1708 + goto out; 1709 + 1710 + if (activate) { 1711 + struct tmigr_walk data; 1712 + 1713 + /* 1714 + * To prevent inconsistent states, active children need to be active in 1715 + * the new parent as well. Inactive children are already marked inactive 1716 + * in the parent group: 1717 + * 1718 + * * When new groups were created by tmigr_setup_groups() starting from 1719 + * the lowest level, then they are not active. They will be set active 1720 + * when the new online CPU comes active. 1721 + * 1722 + * * But if new groups above the current top level are required, it is 1723 + * mandatory to propagate the active state of the already existing 1724 + * child to the new parents. So tmigr_active_up() activates the 1725 + * new parents while walking up from the old root to the new. 1726 + * 1727 + * * It is ensured that @start is active, as this setup path is 1728 + * executed in hotplug prepare callback. This is executed by an 1729 + * already connected and !idle CPU. Even if all other CPUs go idle, 1730 + * the CPU executing the setup will be responsible up to current top 1731 + * level group. And the next time it goes inactive, it will release 1732 + * the new childmask and parent to subsequent walkers through this 1733 + * @child. Therefore propagate active state unconditionally. 
1734 + */ 1735 + WARN_ON_ONCE(!start->parent); 1736 + data.childmask = start->groupmask; 1737 + __walk_groups_from(tmigr_active_up, &data, start, start->parent); 1738 + } 1739 + 1740 + /* Root update */ 1741 + if (list_is_singular(&tmigr_level_list[top])) { 1742 + group = list_first_entry(&tmigr_level_list[top], 1743 + typeof(*group), list); 1744 + WARN_ON_ONCE(group->parent); 1745 + if (tmigr_root) { 1746 + /* Old root should be the same or below */ 1747 + WARN_ON_ONCE(tmigr_root->level > top); 1748 + } 1749 + tmigr_root = group; 1750 + } 1751 + out: 1731 1752 kfree(stack); 1732 1753 1733 1754 return err; ··· 1750 1741 1751 1742 static int tmigr_add_cpu(unsigned int cpu) 1752 1743 { 1744 + struct tmigr_group *old_root = tmigr_root; 1753 1745 int node = cpu_to_node(cpu); 1754 1746 int ret; 1755 1747 1756 - mutex_lock(&tmigr_mutex); 1757 - ret = tmigr_setup_groups(cpu, node); 1758 - mutex_unlock(&tmigr_mutex); 1748 + guard(mutex)(&tmigr_mutex); 1749 + 1750 + ret = tmigr_setup_groups(cpu, node, NULL, false); 1751 + 1752 + /* Root has changed? Connect the old one to the new */ 1753 + if (ret >= 0 && old_root && old_root != tmigr_root) { 1754 + /* 1755 + * The target CPU must never do the prepare work, except 1756 + * on early boot when the boot CPU is the target. Otherwise 1757 + * it may spuriously activate the old top level group inside 1758 + * the new one (nevertheless whether old top level group is 1759 + * active or not) and/or release an uninitialized childmask. 1760 + */ 1761 + WARN_ON_ONCE(cpu == raw_smp_processor_id()); 1762 + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); 1763 + } 1759 1764 1760 1765 return ret; 1761 1766 }