gve: make IRQ handlers and page allocation NUMA aware

All memory in GVE is currently allocated without regard for the NUMA
node of the device. Because access to NUMA-local memory is
significantly cheaper than access to a remote node, this change attempts
to ensure that page frags used in the RX path, including page pool
frags, are allocated on the NUMA node local to the gVNIC device. Note
that this attempt is best-effort. If necessary, the driver will still
allocate non-local memory, as __GFP_THISNODE is not passed. Descriptor
ring allocations are not updated, as dma_alloc_coherent handles that.

This change also modifies the IRQ affinity setting to only select CPUs
from the node local to the device, preserving the behavior that TX and
RX queues of the same index share CPU affinity.

Signed-off-by: Bailey Forrest <bcf@google.com>
Signed-off-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Harshitha Ramamurthy <hramamurthy@google.com>
Signed-off-by: Jeroen de Borst <jeroendb@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250707210107.2742029-1-jeroendb@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Bailey Forrest and committed by

Jakub Kicinski 11 months ago d991666b 11b5d56d

+37 -17

5 changed files

expand all

drivers

net

ethernet

google

gve

gve.h

gve_buffer_mgmt_dqo.c

gve_main.c

gve_rx.c

gve_rx_dqo.c

drivers/net/ethernet/google/gve/gve.h

··· 804 804 struct gve_tx_queue_config tx_cfg; 805 805 struct gve_rx_queue_config rx_cfg; 806 806 u32 num_ntfy_blks; /* split between TX and RX so must be even */ 807 + int numa_node; 807 808 808 809 struct gve_registers __iomem *reg_bar0; /* see gve_register.h */ 809 810 __be32 __iomem *db_bar2; /* "array" of doorbells */

drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c

··· 246 246 .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, 247 247 .order = 0, 248 248 .pool_size = GVE_PAGE_POOL_SIZE_MULTIPLIER * priv->rx_desc_cnt, 249 + .nid = priv->numa_node, 249 250 .dev = &priv->pdev->dev, 250 251 .netdev = priv->dev, 251 252 .napi = &priv->ntfy_blocks[ntfy_id].napi,

+24 -6

drivers/net/ethernet/google/gve/gve_main.c

··· 461 461 return work_done; 462 462 } 463 463 464 + static const struct cpumask *gve_get_node_mask(struct gve_priv *priv) 465 + { 466 + if (priv->numa_node == NUMA_NO_NODE) 467 + return cpu_all_mask; 468 + else 469 + return cpumask_of_node(priv->numa_node); 470 + } 471 + 464 472 static int gve_alloc_notify_blocks(struct gve_priv *priv) 465 473 { 466 474 int num_vecs_requested = priv->num_ntfy_blks + 1; 467 - unsigned int active_cpus; 475 + const struct cpumask *node_mask; 476 + unsigned int cur_cpu; 468 477 int vecs_enabled; 469 478 int i, j; 470 479 int err; ··· 512 503 if (priv->rx_cfg.num_queues > priv->rx_cfg.max_queues) 513 504 priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; 514 505 } 515 - /* Half the notification blocks go to TX and half to RX */ 516 - active_cpus = min_t(int, priv->num_ntfy_blks / 2, num_online_cpus()); 517 506 518 507 /* Setup Management Vector - the last vector */ 519 508 snprintf(priv->mgmt_msix_name, sizeof(priv->mgmt_msix_name), "gve-mgmnt@pci:%s", ··· 540 533 } 541 534 542 535 /* Setup the other blocks - the first n-1 vectors */ 536 + node_mask = gve_get_node_mask(priv); 537 + cur_cpu = cpumask_first(node_mask); 543 538 for (i = 0; i < priv->num_ntfy_blks; i++) { 544 539 struct gve_notify_block *block = &priv->ntfy_blocks[i]; 545 540 int msix_idx = i; ··· 558 549 goto abort_with_some_ntfy_blocks; 559 550 } 560 551 block->irq = priv->msix_vectors[msix_idx].vector; 561 - irq_set_affinity_hint(priv->msix_vectors[msix_idx].vector, 562 - get_cpu_mask(i % active_cpus)); 552 + irq_set_affinity_and_hint(block->irq, 553 + cpumask_of(cur_cpu)); 563 554 block->irq_db_index = &priv->irq_db_indices[i].index; 555 + 556 + cur_cpu = cpumask_next(cur_cpu, node_mask); 557 + /* Wrap once CPUs in the node have been exhausted, or when 558 + * starting RX queue affinities. TX and RX queues of the same 559 + * index share affinity. 
560 + */ 561 + if (cur_cpu >= nr_cpu_ids || (i + 1) == priv->tx_cfg.max_queues) 562 + cur_cpu = cpumask_first(node_mask); 564 563 } 565 564 return 0; 566 565 abort_with_some_ntfy_blocks: ··· 1057 1040 struct page **page, dma_addr_t *dma, 1058 1041 enum dma_data_direction dir, gfp_t gfp_flags) 1059 1042 { 1060 - *page = alloc_page(gfp_flags); 1043 + *page = alloc_pages_node(priv->numa_node, gfp_flags, 0); 1061 1044 if (!*page) { 1062 1045 priv->page_alloc_fail++; 1063 1046 return -ENOMEM; ··· 2339 2322 */ 2340 2323 priv->num_ntfy_blks = (num_ntfy - 1) & ~0x1; 2341 2324 priv->mgmt_msix_idx = priv->num_ntfy_blks; 2325 + priv->numa_node = dev_to_node(&priv->pdev->dev); 2342 2326 2343 2327 priv->tx_cfg.max_queues = 2344 2328 min_t(int, priv->tx_cfg.max_queues, priv->num_ntfy_blks / 2);

+7 -7

drivers/net/ethernet/google/gve/gve_rx.c

··· 192 192 */ 193 193 slots = rx->mask + 1; 194 194 195 - rx->data.page_info = kvzalloc(slots * 196 - sizeof(*rx->data.page_info), GFP_KERNEL); 195 + rx->data.page_info = kvcalloc_node(slots, sizeof(*rx->data.page_info), 196 + GFP_KERNEL, priv->numa_node); 197 197 if (!rx->data.page_info) 198 198 return -ENOMEM; 199 199 ··· 216 216 217 217 if (!rx->data.raw_addressing) { 218 218 for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) { 219 - struct page *page = alloc_page(GFP_KERNEL); 219 + struct page *page = alloc_pages_node(priv->numa_node, 220 + GFP_KERNEL, 0); 220 221 221 222 if (!page) { 222 223 err = -ENOMEM; ··· 304 303 305 304 rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1; 306 305 rx->qpl_copy_pool_head = 0; 307 - rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1, 308 - sizeof(rx->qpl_copy_pool[0]), 309 - GFP_KERNEL); 310 - 306 + rx->qpl_copy_pool = kvcalloc_node(rx->qpl_copy_pool_mask + 1, 307 + sizeof(rx->qpl_copy_pool[0]), 308 + GFP_KERNEL, priv->numa_node); 311 309 if (!rx->qpl_copy_pool) { 312 310 err = -ENOMEM; 313 311 goto abort_with_slots;

+4 -4

drivers/net/ethernet/google/gve/gve_rx_dqo.c

··· 237 237 238 238 rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots : 239 239 gve_get_rx_pages_per_qpl_dqo(cfg->ring_size); 240 - rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states, 241 - sizeof(rx->dqo.buf_states[0]), 242 - GFP_KERNEL); 240 + rx->dqo.buf_states = kvcalloc_node(rx->dqo.num_buf_states, 241 + sizeof(rx->dqo.buf_states[0]), 242 + GFP_KERNEL, priv->numa_node); 243 243 if (!rx->dqo.buf_states) 244 244 return -ENOMEM; 245 245 ··· 488 488 struct gve_rx_buf_state_dqo *buf_state, 489 489 u16 buf_len) 490 490 { 491 - struct page *page = alloc_page(GFP_ATOMIC); 491 + struct page *page = alloc_pages_node(rx->gve->numa_node, GFP_ATOMIC, 0); 492 492 int num_frags; 493 493 494 494 if (!page)

Configure Feed

Configure Feed