lib/group_cpus: make group CPU cluster aware

As CPU core counts increase, the number of NVMe IRQs may be smaller than
the total number of CPUs. This forces multiple CPUs to share the same
IRQ. If the IRQ affinity and the CPU's cluster do not align, a
performance penalty can be observed on some platforms.

This patch improves IRQ affinity by grouping CPUs by cluster within each
NUMA domain, ensuring better locality between CPUs and their assigned NVMe
IRQs.

Details:

The Intel Xeon E platform packs 4 CPU cores into 1 module (cluster), and the
cores in a module share the L2 cache. Suppose there are 40 CPUs in 1 NUMA
domain and 11 IRQs to dispatch. The existing algorithm maps the first 7 IRQs
to 4 CPUs each and the remaining 4 IRQs to 3 CPUs each. The last 4 IRQs may
therefore span clusters. For example, if the 9th IRQ is pinned to CPU32, then
CPU31, which shares that IRQ but sits in a different cluster, will incur
cross-L2 memory access.

CPU |28 29 30 31|32 33 34 35|36 ...
     -------- -------- --------
IRQ      8        9        10

With this patch applied, the first 2 IRQs are each mapped to 2 CPUs and the
remaining 9 IRQs are each mapped to 4 CPUs, which avoids the cross-cluster
memory access.

CPU |00 01 02 03|04 05 06 07|08 09 10 11| ...
     ----- ----- ----------- -----------
IRQ    1     2        3           4

As a result, a 15%+ performance difference is observed in FIO
libaio/randread/bs=8k.

Changes since V1:
- Add more performance details to the commit message.
- Fix an endless loop when topology_cluster_cpumask() returns an invalid mask.

History:
v1: https://lore.kernel.org/all/20251024023038.872616-1-wangyang.guo@intel.com/
v1 [RESEND]: https://lore.kernel.org/all/20251111020608.1501543-1-wangyang.guo@intel.com/

Link: https://lkml.kernel.org/r/20260113022958.3379650-1-wangyang.guo@intel.com
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Tested-by: Dan Liang <dan.liang@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@fb.com>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Radu Rendec <rrendec@redhat.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Wangyang Guo and committed by

Andrew Morton 4 months ago 89802ca3 9a229ae2

+206 -65

1 changed file

expand all

lib

group_cpus.c

+206 -65

lib/group_cpus.c

··· 114 114 return ln->ncpus - rn->ncpus; 115 115 } 116 116 117 - /* 118 - * Allocate group number for each node, so that for each node: 119 - * 120 - * 1) the allocated number is >= 1 121 - * 122 - * 2) the allocated number is <= active CPU number of this node 123 - * 124 - * The actual allocated total groups may be less than @numgrps when 125 - * active total CPU number is less than @numgrps. 126 - * 127 - * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' 128 - * for each node. 129 - */ 130 - static void alloc_nodes_groups(unsigned int numgrps, 131 - cpumask_var_t *node_to_cpumask, 132 - const struct cpumask *cpu_mask, 133 - const nodemask_t nodemsk, 134 - struct cpumask *nmsk, 135 - struct node_groups *node_groups) 117 + static void alloc_groups_to_nodes(unsigned int numgrps, 118 + unsigned int numcpus, 119 + struct node_groups *node_groups, 120 + unsigned int num_nodes) 136 121 { 137 - unsigned n, remaining_ncpus = 0; 122 + unsigned int n, remaining_ncpus = numcpus; 123 + unsigned int ngroups, ncpus; 138 124 139 - for (n = 0; n < nr_node_ids; n++) { 140 - node_groups[n].id = n; 141 - node_groups[n].ncpus = UINT_MAX; 142 - } 143 - 144 - for_each_node_mask(n, nodemsk) { 145 - unsigned ncpus; 146 - 147 - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); 148 - ncpus = cpumask_weight(nmsk); 149 - 150 - if (!ncpus) 151 - continue; 152 - remaining_ncpus += ncpus; 153 - node_groups[n].ncpus = ncpus; 154 - } 155 - 156 - numgrps = min_t(unsigned, remaining_ncpus, numgrps); 157 - 158 - sort(node_groups, nr_node_ids, sizeof(node_groups[0]), 125 + sort(node_groups, num_nodes, sizeof(node_groups[0]), 159 126 ncpus_cmp_func, NULL); 160 127 161 128 /* ··· 193 226 * finally for each node X: grps(X) <= ncpu(X). 
194 227 * 195 228 */ 196 - for (n = 0; n < nr_node_ids; n++) { 197 - unsigned ngroups, ncpus; 198 229 230 + for (n = 0; n < num_nodes; n++) { 199 231 if (node_groups[n].ncpus == UINT_MAX) 200 232 continue; 201 233 ··· 212 246 } 213 247 } 214 248 249 + /* 250 + * Allocate group number for each node, so that for each node: 251 + * 252 + * 1) the allocated number is >= 1 253 + * 254 + * 2) the allocated number is <= active CPU number of this node 255 + * 256 + * The actual allocated total groups may be less than @numgrps when 257 + * active total CPU number is less than @numgrps. 258 + * 259 + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' 260 + * for each node. 261 + */ 262 + static void alloc_nodes_groups(unsigned int numgrps, 263 + cpumask_var_t *node_to_cpumask, 264 + const struct cpumask *cpu_mask, 265 + const nodemask_t nodemsk, 266 + struct cpumask *nmsk, 267 + struct node_groups *node_groups) 268 + { 269 + unsigned int n, numcpus = 0; 270 + 271 + for (n = 0; n < nr_node_ids; n++) { 272 + node_groups[n].id = n; 273 + node_groups[n].ncpus = UINT_MAX; 274 + } 275 + 276 + for_each_node_mask(n, nodemsk) { 277 + unsigned int ncpus; 278 + 279 + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); 280 + ncpus = cpumask_weight(nmsk); 281 + 282 + if (!ncpus) 283 + continue; 284 + numcpus += ncpus; 285 + node_groups[n].ncpus = ncpus; 286 + } 287 + 288 + numgrps = min_t(unsigned int, numcpus, numgrps); 289 + alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids); 290 + } 291 + 292 + static void assign_cpus_to_groups(unsigned int ncpus, 293 + struct cpumask *nmsk, 294 + struct node_groups *nv, 295 + struct cpumask *masks, 296 + unsigned int *curgrp, 297 + unsigned int last_grp) 298 + { 299 + unsigned int v, cpus_per_grp, extra_grps; 300 + /* Account for rounding errors */ 301 + extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); 302 + 303 + /* Spread allocated groups on CPUs of the current node */ 304 + for (v = 0; v < nv->ngroups; v++, 
*curgrp += 1) { 305 + cpus_per_grp = ncpus / nv->ngroups; 306 + 307 + /* Account for extra groups to compensate rounding errors */ 308 + if (extra_grps) { 309 + cpus_per_grp++; 310 + --extra_grps; 311 + } 312 + 313 + /* 314 + * wrapping has to be considered given 'startgrp' 315 + * may start anywhere 316 + */ 317 + if (*curgrp >= last_grp) 318 + *curgrp = 0; 319 + grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp); 320 + } 321 + } 322 + 323 + static int alloc_cluster_groups(unsigned int ncpus, 324 + unsigned int ngroups, 325 + struct cpumask *node_cpumask, 326 + cpumask_var_t msk, 327 + const struct cpumask ***clusters_ptr, 328 + struct node_groups **cluster_groups_ptr) 329 + { 330 + unsigned int ncluster = 0; 331 + unsigned int cpu, nc, n; 332 + const struct cpumask *cluster_mask; 333 + const struct cpumask **clusters; 334 + struct node_groups *cluster_groups; 335 + 336 + cpumask_copy(msk, node_cpumask); 337 + 338 + /* Probe how many clusters in this node. */ 339 + while (1) { 340 + cpu = cpumask_first(msk); 341 + if (cpu >= nr_cpu_ids) 342 + break; 343 + 344 + cluster_mask = topology_cluster_cpumask(cpu); 345 + if (!cpumask_weight(cluster_mask)) 346 + goto no_cluster; 347 + /* Clean out CPUs on the same cluster. */ 348 + cpumask_andnot(msk, msk, cluster_mask); 349 + ncluster++; 350 + } 351 + 352 + /* If ngroups < ncluster, cross cluster is inevitable, skip. */ 353 + if (ncluster == 0 || ncluster > ngroups) 354 + goto no_cluster; 355 + 356 + /* Allocate memory based on cluster number. */ 357 + clusters = kcalloc(ncluster, sizeof(struct cpumask *), GFP_KERNEL); 358 + if (!clusters) 359 + goto no_cluster; 360 + cluster_groups = kcalloc(ncluster, sizeof(struct node_groups), GFP_KERNEL); 361 + if (!cluster_groups) 362 + goto fail_cluster_groups; 363 + 364 + /* Filling cluster info for later process. 
*/ 365 + cpumask_copy(msk, node_cpumask); 366 + for (n = 0; n < ncluster; n++) { 367 + cpu = cpumask_first(msk); 368 + cluster_mask = topology_cluster_cpumask(cpu); 369 + nc = cpumask_weight_and(cluster_mask, node_cpumask); 370 + clusters[n] = cluster_mask; 371 + cluster_groups[n].id = n; 372 + cluster_groups[n].ncpus = nc; 373 + cpumask_andnot(msk, msk, cluster_mask); 374 + } 375 + 376 + alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster); 377 + 378 + *clusters_ptr = clusters; 379 + *cluster_groups_ptr = cluster_groups; 380 + return ncluster; 381 + 382 + fail_cluster_groups: 383 + kfree(clusters); 384 + no_cluster: 385 + return 0; 386 + } 387 + 388 + /* 389 + * Try group CPUs evenly for cluster locality within a NUMA node. 390 + * 391 + * Return: true if success, false otherwise. 392 + */ 393 + static bool __try_group_cluster_cpus(unsigned int ncpus, 394 + unsigned int ngroups, 395 + struct cpumask *node_cpumask, 396 + struct cpumask *masks, 397 + unsigned int *curgrp, 398 + unsigned int last_grp) 399 + { 400 + struct node_groups *cluster_groups; 401 + const struct cpumask **clusters; 402 + unsigned int ncluster; 403 + bool ret = false; 404 + cpumask_var_t nmsk; 405 + unsigned int i, nc; 406 + 407 + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) 408 + goto fail_nmsk_alloc; 409 + 410 + ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk, 411 + &clusters, &cluster_groups); 412 + 413 + if (ncluster == 0) 414 + goto fail_no_clusters; 415 + 416 + for (i = 0; i < ncluster; i++) { 417 + struct node_groups *nv = &cluster_groups[i]; 418 + 419 + /* Get the cpus on this cluster. 
*/ 420 + cpumask_and(nmsk, node_cpumask, clusters[nv->id]); 421 + nc = cpumask_weight(nmsk); 422 + if (!nc) 423 + continue; 424 + WARN_ON_ONCE(nv->ngroups > nc); 425 + 426 + assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp); 427 + } 428 + 429 + ret = true; 430 + kfree(cluster_groups); 431 + kfree(clusters); 432 + fail_no_clusters: 433 + free_cpumask_var(nmsk); 434 + fail_nmsk_alloc: 435 + return ret; 436 + } 437 + 215 438 static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, 216 439 cpumask_var_t *node_to_cpumask, 217 440 const struct cpumask *cpu_mask, 218 441 struct cpumask *nmsk, struct cpumask *masks) 219 442 { 220 - unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; 443 + unsigned int i, n, nodes, done = 0; 221 444 unsigned int last_grp = numgrps; 222 445 unsigned int curgrp = startgrp; 223 446 nodemask_t nodemsk = NODE_MASK_NONE; ··· 442 287 alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, 443 288 nodemsk, nmsk, node_groups); 444 289 for (i = 0; i < nr_node_ids; i++) { 445 - unsigned int ncpus, v; 290 + unsigned int ncpus; 446 291 struct node_groups *nv = &node_groups[i]; 447 292 448 293 if (nv->ngroups == UINT_MAX) ··· 456 301 457 302 WARN_ON_ONCE(nv->ngroups > ncpus); 458 303 459 - /* Account for rounding errors */ 460 - extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); 461 - 462 - /* Spread allocated groups on CPUs of the current node */ 463 - for (v = 0; v < nv->ngroups; v++, curgrp++) { 464 - cpus_per_grp = ncpus / nv->ngroups; 465 - 466 - /* Account for extra groups to compensate rounding errors */ 467 - if (extra_grps) { 468 - cpus_per_grp++; 469 - --extra_grps; 470 - } 471 - 472 - /* 473 - * wrapping has to be considered given 'startgrp' 474 - * may start anywhere 475 - */ 476 - if (curgrp >= last_grp) 477 - curgrp = 0; 478 - grp_spread_init_one(&masks[curgrp], nmsk, 479 - cpus_per_grp); 304 + if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk, 305 + masks, &curgrp, last_grp)) { 306 + done 
+= nv->ngroups; 307 + continue; 480 308 } 309 + 310 + assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp, 311 + last_grp); 481 312 done += nv->ngroups; 482 313 } 483 314 kfree(node_groups);

Configure Feed

Configure Feed