Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/topology: Make sched_init_numa() use a set for the deduplicating sort

The deduplicating sort in sched_init_numa() assumes that the first line in
the distance table contains all unique values in the entire table. I've
been trying to pen what this exactly means for the topology, but it's not
straightforward. For instance, topology.c uses this example:

  node   0   1   2   3
    0:  10  20  20  30
    1:  20  10  20  20
    2:  20  20  10  20
    3:  30  20  20  10

  0 ----- 1
  |     / |
  |    /  |
  |   /   |
  2 ----- 3

Which works out just fine. However, if we swap nodes 0 and 1:

  1 ----- 0
  |     / |
  |    /  |
  |   /   |
  2 ----- 3

we get this distance table:

  node   0   1   2   3
    0:  10  20  20  20
    1:  20  10  20  30
    2:  20  20  10  20
    3:  20  30  20  10

Which breaks the deduplicating sort (non-representative first line). In
this case this would just be a renumbering exercise, but it so happens that
we can have a deduplicating sort that goes through the whole table in O(n²)
at the extra cost of a temporary memory allocation (i.e. any form of set).

The ACPI spec (SLIT) mentions distances are encoded in 8 bits. Following
this, implement the set as a 256-bit bitmap. Should this not be
satisfactory (i.e. we want to support 32-bit values), then we'll have to go
for some other sparse set implementation.

This has the added benefit of letting us allocate just the right amount of
memory for sched_domains_numa_distance[], rather than an arbitrary
(nr_node_ids + 1).

Note: DT binding equivalent (distance-map) decodes distances as 32-bit
values.

Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210122123943.1217-2-valentin.schneider@arm.com

authored by

Valentin Schneider and committed by
Peter Zijlstra
620a6dc4 0ae78eec

+50 -52
+1
include/linux/topology.h
··· 48 48 /* Conform to ACPI 2.0 SLIT distance definitions */ 49 49 #define LOCAL_DISTANCE 10 50 50 #define REMOTE_DISTANCE 20 51 + #define DISTANCE_BITS 8 51 52 #ifndef node_distance 52 53 #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) 53 54 #endif
+49 -52
kernel/sched/topology.c
··· 1596 1596 } 1597 1597 } 1598 1598 1599 + 1600 + #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) 1601 + 1599 1602 void sched_init_numa(void) 1600 1603 { 1601 - int next_distance, curr_distance = node_distance(0, 0); 1602 1604 struct sched_domain_topology_level *tl; 1603 - int level = 0; 1604 - int i, j, k; 1605 - 1606 - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); 1607 - if (!sched_domains_numa_distance) 1608 - return; 1609 - 1610 - /* Includes NUMA identity node at level 0. */ 1611 - sched_domains_numa_distance[level++] = curr_distance; 1612 - sched_domains_numa_levels = level; 1605 + unsigned long *distance_map; 1606 + int nr_levels = 0; 1607 + int i, j; 1613 1608 1614 1609 /* 1615 1610 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 1616 1611 * unique distances in the node_distance() table. 1617 - * 1618 - * Assumes node_distance(0,j) includes all distances in 1619 - * node_distance(i,j) in order to avoid cubic time. 1620 1612 */ 1621 - next_distance = curr_distance; 1613 + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); 1614 + if (!distance_map) 1615 + return; 1616 + 1617 + bitmap_zero(distance_map, NR_DISTANCE_VALUES); 1622 1618 for (i = 0; i < nr_node_ids; i++) { 1623 1619 for (j = 0; j < nr_node_ids; j++) { 1624 - for (k = 0; k < nr_node_ids; k++) { 1625 - int distance = node_distance(i, k); 1620 + int distance = node_distance(i, j); 1626 1621 1627 - if (distance > curr_distance && 1628 - (distance < next_distance || 1629 - next_distance == curr_distance)) 1630 - next_distance = distance; 1631 - 1632 - /* 1633 - * While not a strong assumption it would be nice to know 1634 - * about cases where if node A is connected to B, B is not 1635 - * equally connected to A. 
1636 - */ 1637 - if (sched_debug() && node_distance(k, i) != distance) 1638 - sched_numa_warn("Node-distance not symmetric"); 1639 - 1640 - if (sched_debug() && i && !find_numa_distance(distance)) 1641 - sched_numa_warn("Node-0 not representative"); 1622 + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { 1623 + sched_numa_warn("Invalid distance value range"); 1624 + return; 1642 1625 } 1643 - if (next_distance != curr_distance) { 1644 - sched_domains_numa_distance[level++] = next_distance; 1645 - sched_domains_numa_levels = level; 1646 - curr_distance = next_distance; 1647 - } else break; 1648 - } 1649 1626 1650 - /* 1651 - * In case of sched_debug() we verify the above assumption. 1652 - */ 1653 - if (!sched_debug()) 1654 - break; 1627 + bitmap_set(distance_map, distance, 1); 1628 + } 1629 + } 1630 + /* 1631 + * We can now figure out how many unique distance values there are and 1632 + * allocate memory accordingly. 1633 + */ 1634 + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); 1635 + 1636 + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); 1637 + if (!sched_domains_numa_distance) { 1638 + bitmap_free(distance_map); 1639 + return; 1655 1640 } 1656 1641 1642 + for (i = 0, j = 0; i < nr_levels; i++, j++) { 1643 + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); 1644 + sched_domains_numa_distance[i] = j; 1645 + } 1646 + 1647 + bitmap_free(distance_map); 1648 + 1657 1649 /* 1658 - * 'level' contains the number of unique distances 1650 + * 'nr_levels' contains the number of unique distances 1659 1651 * 1660 1652 * The sched_domains_numa_distance[] array includes the actual distance 1661 1653 * numbers. ··· 1656 1664 /* 1657 1665 * Here, we should temporarily reset sched_domains_numa_levels to 0. 1658 1666 * If it fails to allocate memory for array sched_domains_numa_masks[][], 1659 - * the array will contain less then 'level' members. 
This could be 1667 + * the array will contain less then 'nr_levels' members. This could be 1660 1668 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 1661 1669 * in other functions. 1662 1670 * 1663 - * We reset it to 'level' at the end of this function. 1671 + * We reset it to 'nr_levels' at the end of this function. 1664 1672 */ 1665 1673 sched_domains_numa_levels = 0; 1666 1674 1667 - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 1675 + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); 1668 1676 if (!sched_domains_numa_masks) 1669 1677 return; 1670 1678 ··· 1672 1680 * Now for each level, construct a mask per node which contains all 1673 1681 * CPUs of nodes that are that many hops away from us. 1674 1682 */ 1675 - for (i = 0; i < level; i++) { 1683 + for (i = 0; i < nr_levels; i++) { 1676 1684 sched_domains_numa_masks[i] = 1677 1685 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 1678 1686 if (!sched_domains_numa_masks[i]) ··· 1680 1688 1681 1689 for (j = 0; j < nr_node_ids; j++) { 1682 1690 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 1691 + int k; 1692 + 1683 1693 if (!mask) 1684 1694 return; 1685 1695 1686 1696 sched_domains_numa_masks[i][j] = mask; 1687 1697 1688 1698 for_each_node(k) { 1699 + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) 1700 + sched_numa_warn("Node-distance not symmetric"); 1701 + 1689 1702 if (node_distance(j, k) > sched_domains_numa_distance[i]) 1690 1703 continue; 1691 1704 ··· 1702 1705 /* Compute default topology size */ 1703 1706 for (i = 0; sched_domain_topology[i].mask; i++); 1704 1707 1705 - tl = kzalloc((i + level + 1) * 1708 + tl = kzalloc((i + nr_levels) * 1706 1709 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 1707 1710 if (!tl) 1708 1711 return; ··· 1725 1728 /* 1726 1729 * .. and append 'j' levels of NUMA goodness. 
1727 1730 */ 1728 - for (j = 1; j < level; i++, j++) { 1731 + for (j = 1; j < nr_levels; i++, j++) { 1729 1732 tl[i] = (struct sched_domain_topology_level){ 1730 1733 .mask = sd_numa_mask, 1731 1734 .sd_flags = cpu_numa_flags, ··· 1737 1740 1738 1741 sched_domain_topology = tl; 1739 1742 1740 - sched_domains_numa_levels = level; 1741 - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; 1743 + sched_domains_numa_levels = nr_levels; 1744 + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; 1742 1745 1743 1746 init_numa_topology_type(); 1744 1747 }