Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mlxsw-use-page-pool-for-rx-buffers-allocation'

Petr Machata says:

====================
mlxsw: Use page pool for Rx buffers allocation

Amit Cohen writes:

After using NAPI to process events from hardware, the next step is to
use page pool for Rx buffers allocation, which also enhances
performance.

To simplify this change, first use page pool to allocate one contiguous
buffer for each packet; later, memory consumption can be improved by using
fragmented buffers.

This set significantly enhances mlxsw driver performance: the CPU can handle
about 370% of the packets per second it previously handled.

The next planned improvement is using XDP to optimize telemetry.

Patch set overview:
Patches #1-#2 are small preparations for page pool usage
Patch #3 initializes page pool, but does not use it
Patch #4 converts the driver to use page pool for buffer allocation
Patch #5 is an optimization for buffer access
Patch #6 cleans up an unused structure
Patch #7 uses napi_consume_skb() as part of Tx completion
====================

Link: https://lore.kernel.org/r/cover.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+144 -60
+1
drivers/net/ethernet/mellanox/mlxsw/Kconfig
··· 33 33 config MLXSW_PCI 34 34 tristate "PCI bus implementation for Mellanox Technologies Switch ASICs" 35 35 depends on PCI && HAS_IOMEM && MLXSW_CORE 36 + select PAGE_POOL 36 37 default m 37 38 help 38 39 This is PCI bus implementation for Mellanox Technologies Switch ASICs.
+143 -60
drivers/net/ethernet/mellanox/mlxsw/pci.c
··· 13 13 #include <linux/if_vlan.h> 14 14 #include <linux/log2.h> 15 15 #include <linux/string.h> 16 + #include <net/page_pool/helpers.h> 16 17 17 18 #include "pci_hw.h" 18 19 #include "pci.h" ··· 62 61 }; 63 62 64 63 struct mlxsw_pci_queue_elem_info { 64 + struct page *page; 65 65 char *elem; /* pointer to actual dma mapped element mem chunk */ 66 - union { 67 - struct { 68 - struct sk_buff *skb; 69 - } sdq; 70 - struct { 71 - struct sk_buff *skb; 72 - } rdq; 73 - } u; 66 + struct { 67 + struct sk_buff *skb; 68 + } sdq; 74 69 }; 75 70 76 71 struct mlxsw_pci_queue { ··· 85 88 enum mlxsw_pci_cqe_v v; 86 89 struct mlxsw_pci_queue *dq; 87 90 struct napi_struct napi; 91 + struct page_pool *page_pool; 88 92 } cq; 89 93 struct { 90 94 struct tasklet_struct tasklet; 91 95 } eq; 96 + struct { 97 + struct mlxsw_pci_queue *cq; 98 + } rdq; 92 99 } u; 93 100 }; 94 101 ··· 336 335 mlxsw_cmd_hw2sw_sdq(mlxsw_pci->core, q->num); 337 336 } 338 337 338 + #define MLXSW_PCI_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) 339 + 340 + #define MLXSW_PCI_RX_BUF_SW_OVERHEAD \ 341 + (MLXSW_PCI_SKB_HEADROOM + \ 342 + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) 343 + 344 + static void 345 + mlxsw_pci_wqe_rx_frag_set(struct mlxsw_pci *mlxsw_pci, struct page *page, 346 + char *wqe, int index, size_t frag_len) 347 + { 348 + dma_addr_t mapaddr; 349 + 350 + mapaddr = page_pool_get_dma_addr(page); 351 + mapaddr += MLXSW_PCI_SKB_HEADROOM; 352 + 353 + mlxsw_pci_wqe_address_set(wqe, index, mapaddr); 354 + mlxsw_pci_wqe_byte_count_set(wqe, index, frag_len); 355 + } 356 + 339 357 static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe, 340 358 int index, char *frag_data, size_t frag_len, 341 359 int direction) ··· 384 364 dma_unmap_single(&pdev->dev, mapaddr, frag_len, direction); 385 365 } 386 366 387 - static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci, 388 - struct mlxsw_pci_queue_elem_info *elem_info, 389 - gfp_t gfp) 367 + static struct sk_buff 
*mlxsw_pci_rdq_build_skb(struct page *page, 368 + u16 byte_count) 390 369 { 391 - size_t buf_len = MLXSW_PORT_MAX_MTU; 392 - char *wqe = elem_info->elem; 370 + void *data = page_address(page); 371 + unsigned int allocated_size; 393 372 struct sk_buff *skb; 394 - int err; 395 373 396 - skb = __netdev_alloc_skb_ip_align(NULL, buf_len, gfp); 397 - if (!skb) 398 - return -ENOMEM; 374 + net_prefetch(data); 375 + allocated_size = page_size(page); 376 + skb = napi_build_skb(data, allocated_size); 377 + if (unlikely(!skb)) 378 + return ERR_PTR(-ENOMEM); 399 379 400 - err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data, 401 - buf_len, DMA_FROM_DEVICE); 402 - if (err) 403 - goto err_frag_map; 404 - 405 - elem_info->u.rdq.skb = skb; 406 - return 0; 407 - 408 - err_frag_map: 409 - dev_kfree_skb_any(skb); 410 - return err; 380 + skb_reserve(skb, MLXSW_PCI_SKB_HEADROOM); 381 + skb_put(skb, byte_count); 382 + return skb; 411 383 } 412 384 413 - static void mlxsw_pci_rdq_skb_free(struct mlxsw_pci *mlxsw_pci, 414 - struct mlxsw_pci_queue_elem_info *elem_info) 385 + static int mlxsw_pci_rdq_page_alloc(struct mlxsw_pci_queue *q, 386 + struct mlxsw_pci_queue_elem_info *elem_info) 415 387 { 416 - struct sk_buff *skb; 417 - char *wqe; 388 + struct mlxsw_pci_queue *cq = q->u.rdq.cq; 389 + size_t buf_len = MLXSW_PORT_MAX_MTU; 390 + char *wqe = elem_info->elem; 391 + struct page *page; 418 392 419 - skb = elem_info->u.rdq.skb; 420 - wqe = elem_info->elem; 393 + page = page_pool_dev_alloc_pages(cq->u.cq.page_pool); 394 + if (unlikely(!page)) 395 + return -ENOMEM; 421 396 422 - mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE); 423 - dev_kfree_skb_any(skb); 397 + mlxsw_pci_wqe_rx_frag_set(q->pci, page, wqe, 0, buf_len); 398 + elem_info->page = page; 399 + return 0; 400 + } 401 + 402 + static void mlxsw_pci_rdq_page_free(struct mlxsw_pci_queue *q, 403 + struct mlxsw_pci_queue_elem_info *elem_info) 404 + { 405 + struct mlxsw_pci_queue *cq = q->u.rdq.cq; 406 + 407 + 
page_pool_put_page(cq->u.cq.page_pool, elem_info->page, -1, false); 424 408 } 425 409 426 410 static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox, ··· 458 434 459 435 cq = mlxsw_pci_cq_get(mlxsw_pci, cq_num); 460 436 cq->u.cq.dq = q; 437 + q->u.rdq.cq = cq; 461 438 462 439 mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q); 463 440 464 441 for (i = 0; i < q->count; i++) { 465 442 elem_info = mlxsw_pci_queue_elem_info_producer_get(q); 466 443 BUG_ON(!elem_info); 467 - err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_KERNEL); 444 + err = mlxsw_pci_rdq_page_alloc(q, elem_info); 468 445 if (err) 469 446 goto rollback; 470 447 /* Everything is set up, ring doorbell to pass elem to HW */ ··· 478 453 rollback: 479 454 for (i--; i >= 0; i--) { 480 455 elem_info = mlxsw_pci_queue_elem_info_get(q, i); 481 - mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info); 456 + mlxsw_pci_rdq_page_free(q, elem_info); 482 457 } 458 + q->u.rdq.cq = NULL; 483 459 cq->u.cq.dq = NULL; 484 460 mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num); 485 461 ··· 496 470 mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num); 497 471 for (i = 0; i < q->count; i++) { 498 472 elem_info = mlxsw_pci_queue_elem_info_get(q, i); 499 - mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info); 473 + mlxsw_pci_rdq_page_free(q, elem_info); 500 474 } 501 475 } 502 476 ··· 541 515 struct mlxsw_pci_queue *q, 542 516 u16 consumer_counter_limit, 543 517 enum mlxsw_pci_cqe_v cqe_v, 544 - char *cqe) 518 + char *cqe, int budget) 545 519 { 546 520 struct pci_dev *pdev = mlxsw_pci->pdev; 547 521 struct mlxsw_pci_queue_elem_info *elem_info; ··· 552 526 553 527 spin_lock(&q->lock); 554 528 elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); 555 - tx_info = mlxsw_skb_cb(elem_info->u.sdq.skb)->tx_info; 556 - skb = elem_info->u.sdq.skb; 529 + tx_info = mlxsw_skb_cb(elem_info->sdq.skb)->tx_info; 530 + skb = elem_info->sdq.skb; 557 531 wqe = elem_info->elem; 558 532 for (i = 0; i < MLXSW_PCI_WQE_SG_ENTRIES; i++) 559 533 
mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, i, DMA_TO_DEVICE); ··· 567 541 } 568 542 569 543 if (skb) 570 - dev_kfree_skb_any(skb); 571 - elem_info->u.sdq.skb = NULL; 544 + napi_consume_skb(skb, budget); 545 + elem_info->sdq.skb = NULL; 572 546 573 547 if (q->consumer_counter++ != consumer_counter_limit) 574 548 dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in SDQ\n"); ··· 631 605 { 632 606 struct pci_dev *pdev = mlxsw_pci->pdev; 633 607 struct mlxsw_pci_queue_elem_info *elem_info; 608 + struct mlxsw_pci_queue *cq = q->u.rdq.cq; 634 609 struct mlxsw_rx_info rx_info = {}; 635 - char wqe[MLXSW_PCI_WQE_SIZE]; 636 610 struct sk_buff *skb; 611 + struct page *page; 637 612 u16 byte_count; 638 613 int err; 639 614 640 615 elem_info = mlxsw_pci_queue_elem_info_consumer_get(q); 641 - skb = elem_info->u.rdq.skb; 642 - memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE); 643 616 644 617 if (q->consumer_counter++ != consumer_counter_limit) 645 618 dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n"); 646 619 647 - err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_ATOMIC); 620 + byte_count = mlxsw_pci_cqe_byte_count_get(cqe); 621 + if (mlxsw_pci_cqe_crc_get(cqe_v, cqe)) 622 + byte_count -= ETH_FCS_LEN; 623 + 624 + page = elem_info->page; 625 + 626 + err = mlxsw_pci_rdq_page_alloc(q, elem_info); 648 627 if (err) { 649 - dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n"); 628 + dev_err_ratelimited(&pdev->dev, "Failed to alloc page\n"); 650 629 goto out; 651 630 } 652 631 653 - mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE); 632 + skb = mlxsw_pci_rdq_build_skb(page, byte_count); 633 + if (IS_ERR(skb)) { 634 + dev_err_ratelimited(&pdev->dev, "Failed to build skb for RDQ\n"); 635 + page_pool_recycle_direct(cq->u.cq.page_pool, page); 636 + goto out; 637 + } 638 + 639 + skb_mark_for_recycle(skb); 654 640 655 641 if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) { 656 642 rx_info.is_lag = true; ··· 695 657 696 658 
mlxsw_pci_skb_cb_ts_set(mlxsw_pci, skb, cqe_v, cqe); 697 659 698 - byte_count = mlxsw_pci_cqe_byte_count_get(cqe); 699 - if (mlxsw_pci_cqe_crc_get(cqe_v, cqe)) 700 - byte_count -= ETH_FCS_LEN; 701 - skb_put(skb, byte_count); 702 660 mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info); 703 661 704 662 out: ··· 819 785 mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); 820 786 821 787 mlxsw_pci_cqe_sdq_handle(mlxsw_pci, sdq, 822 - wqe_counter, q->u.cq.v, ncqe); 788 + wqe_counter, q->u.cq.v, ncqe, budget); 823 789 824 790 work_done++; 825 791 } ··· 866 832 mlxsw_pci_napi_poll_cq_rx); 867 833 break; 868 834 } 869 - 870 - napi_enable(&q->u.cq.napi); 871 835 } 872 836 873 837 static void mlxsw_pci_cq_napi_teardown(struct mlxsw_pci_queue *q) 874 838 { 875 - napi_disable(&q->u.cq.napi); 876 839 netif_napi_del(&q->u.cq.napi); 840 + } 841 + 842 + static int mlxsw_pci_cq_page_pool_init(struct mlxsw_pci_queue *q, 843 + enum mlxsw_pci_cq_type cq_type) 844 + { 845 + struct page_pool_params pp_params = {}; 846 + struct mlxsw_pci *mlxsw_pci = q->pci; 847 + struct page_pool *page_pool; 848 + u32 max_pkt_size; 849 + 850 + if (cq_type != MLXSW_PCI_CQ_RDQ) 851 + return 0; 852 + 853 + max_pkt_size = MLXSW_PORT_MAX_MTU + MLXSW_PCI_RX_BUF_SW_OVERHEAD; 854 + pp_params.order = get_order(max_pkt_size); 855 + pp_params.flags = PP_FLAG_DMA_MAP; 856 + pp_params.pool_size = MLXSW_PCI_WQE_COUNT; 857 + pp_params.nid = dev_to_node(&mlxsw_pci->pdev->dev); 858 + pp_params.dev = &mlxsw_pci->pdev->dev; 859 + pp_params.napi = &q->u.cq.napi; 860 + pp_params.dma_dir = DMA_FROM_DEVICE; 861 + 862 + page_pool = page_pool_create(&pp_params); 863 + if (IS_ERR(page_pool)) 864 + return PTR_ERR(page_pool); 865 + 866 + q->u.cq.page_pool = page_pool; 867 + return 0; 868 + } 869 + 870 + static void mlxsw_pci_cq_page_pool_fini(struct mlxsw_pci_queue *q, 871 + enum mlxsw_pci_cq_type cq_type) 872 + { 873 + if (cq_type != MLXSW_PCI_CQ_RDQ) 874 + return; 875 + 876 + page_pool_destroy(q->u.cq.page_pool); 877 877 } 
878 878 879 879 static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox, 880 880 struct mlxsw_pci_queue *q) 881 881 { 882 + enum mlxsw_pci_cq_type cq_type = mlxsw_pci_cq_type(mlxsw_pci, q); 882 883 int i; 883 884 int err; 884 885 ··· 943 874 err = mlxsw_cmd_sw2hw_cq(mlxsw_pci->core, mbox, q->num); 944 875 if (err) 945 876 return err; 946 - mlxsw_pci_cq_napi_setup(q, mlxsw_pci_cq_type(mlxsw_pci, q)); 877 + mlxsw_pci_cq_napi_setup(q, cq_type); 878 + 879 + err = mlxsw_pci_cq_page_pool_init(q, cq_type); 880 + if (err) 881 + goto err_page_pool_init; 882 + 883 + napi_enable(&q->u.cq.napi); 947 884 mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q); 948 885 mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q); 949 886 return 0; 887 + 888 + err_page_pool_init: 889 + mlxsw_pci_cq_napi_teardown(q); 890 + return err; 950 891 } 951 892 952 893 static void mlxsw_pci_cq_fini(struct mlxsw_pci *mlxsw_pci, 953 894 struct mlxsw_pci_queue *q) 954 895 { 896 + enum mlxsw_pci_cq_type cq_type = mlxsw_pci_cq_type(mlxsw_pci, q); 897 + 898 + napi_disable(&q->u.cq.napi); 899 + mlxsw_pci_cq_page_pool_fini(q, cq_type); 955 900 mlxsw_pci_cq_napi_teardown(q); 956 901 mlxsw_cmd_hw2sw_cq(mlxsw_pci->core, q->num); 957 902 } ··· 2002 1919 goto unlock; 2003 1920 } 2004 1921 mlxsw_skb_cb(skb)->tx_info = *tx_info; 2005 - elem_info->u.sdq.skb = skb; 1922 + elem_info->sdq.skb = skb; 2006 1923 2007 1924 wqe = elem_info->elem; 2008 1925 mlxsw_pci_wqe_c_set(wqe, 1); /* always report completion */