Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'introduce-page_pool_alloc-related-api'

Yunsheng Lin says:

====================
introduce page_pool_alloc() related API

In [1] & [2] & [3], there are use cases for veth and virtio_net
to use frag support in page pool to reduce memory usage, and they
may request different frag sizes depending on the head/tail
room space for xdp_frame/shinfo and mtu/packet size. When the
requested frag size is large enough that a single page cannot
be split into more than one frag, using frag support only incurs
a performance penalty because of the extra frag count handling
for frag support.

So this patchset provides a page pool API for the driver to
allocate memory with the least memory utilization and performance
penalty when it doesn't know the size of memory it needs
beforehand.

1. https://patchwork.kernel.org/project/netdevbpf/patch/d3ae6bd3537fbce379382ac6a42f67e22f27ece2.1683896626.git.lorenzo@kernel.org/
2. https://patchwork.kernel.org/project/netdevbpf/patch/20230526054621.18371-3-liangchen.linux@gmail.com/
3. https://github.com/alobakin/linux/tree/iavf-pp-frag
====================

Link: https://lore.kernel.org/r/20231020095952.11055-1-linyunsheng@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+218 -56
+3 -1
Documentation/networking/page_pool.rst
··· 58 58 59 59 .. kernel-doc:: include/net/page_pool/helpers.h 60 60 :identifiers: page_pool_put_page page_pool_put_full_page 61 - page_pool_recycle_direct page_pool_dev_alloc_pages 61 + page_pool_recycle_direct page_pool_free_va 62 + page_pool_dev_alloc_pages page_pool_dev_alloc_frag 63 + page_pool_dev_alloc page_pool_dev_alloc_va 62 64 page_pool_get_dma_addr page_pool_get_dma_dir 63 65 64 66 .. kernel-doc:: net/core/page_pool.c
-2
drivers/net/ethernet/broadcom/bnxt/bnxt.c
··· 3302 3302 pp.dma_dir = bp->rx_dir; 3303 3303 pp.max_len = PAGE_SIZE; 3304 3304 pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; 3305 - if (PAGE_SIZE > BNXT_RX_PAGE_SIZE) 3306 - pp.flags |= PP_FLAG_PAGE_FRAG; 3307 3305 3308 3306 rxr->page_pool = page_pool_create(&pp); 3309 3307 if (IS_ERR(rxr->page_pool)) {
+1 -2
drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
··· 4940 4940 static void hns3_alloc_page_pool(struct hns3_enet_ring *ring) 4941 4941 { 4942 4942 struct page_pool_params pp_params = { 4943 - .flags = PP_FLAG_DMA_MAP | PP_FLAG_PAGE_FRAG | 4944 - PP_FLAG_DMA_SYNC_DEV, 4943 + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, 4945 4944 .order = hns3_page_order(ring), 4946 4945 .pool_size = ring->desc_num * hns3_buf_size(ring) / 4947 4946 (PAGE_SIZE << hns3_page_order(ring)),
-3
drivers/net/ethernet/intel/idpf/idpf_txrx.c
··· 595 595 .offset = 0, 596 596 }; 597 597 598 - if (rxbufq->rx_buf_size == IDPF_RX_BUF_2048) 599 - pp.flags |= PP_FLAG_PAGE_FRAG; 600 - 601 598 return page_pool_create(&pp); 602 599 } 603 600
+1 -1
drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
··· 1404 1404 } 1405 1405 1406 1406 pp_params.order = get_order(buf_size); 1407 - pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP; 1407 + pp_params.flags = PP_FLAG_DMA_MAP; 1408 1408 pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs); 1409 1409 pp_params.nid = NUMA_NO_NODE; 1410 1410 pp_params.dev = pfvf->dev;
+1 -1
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
··· 897 897 struct page_pool_params pp_params = { 0 }; 898 898 899 899 pp_params.order = 0; 900 - pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | PP_FLAG_PAGE_FRAG; 900 + pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; 901 901 pp_params.pool_size = pool_size; 902 902 pp_params.nid = node; 903 903 pp_params.dev = rq->pdev;
+16 -9
drivers/net/veth.c
··· 737 737 if (skb_shared(skb) || skb_head_is_locked(skb) || 738 738 skb_shinfo(skb)->nr_frags || 739 739 skb_headroom(skb) < XDP_PACKET_HEADROOM) { 740 - u32 size, len, max_head_size, off; 740 + u32 size, len, max_head_size, off, truesize, page_offset; 741 741 struct sk_buff *nskb; 742 742 struct page *page; 743 743 int i, head_off; 744 + void *va; 744 745 745 746 /* We need a private copy of the skb and data buffers since 746 747 * the ebpf program can modify it. We segment the original skb ··· 754 753 if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size) 755 754 goto drop; 756 755 756 + size = min_t(u32, skb->len, max_head_size); 757 + truesize = SKB_HEAD_ALIGN(size) + VETH_XDP_HEADROOM; 758 + 757 759 /* Allocate skb head */ 758 - page = page_pool_dev_alloc_pages(rq->page_pool); 759 - if (!page) 760 + va = page_pool_dev_alloc_va(rq->page_pool, &truesize); 761 + if (!va) 760 762 goto drop; 761 763 762 - nskb = napi_build_skb(page_address(page), PAGE_SIZE); 764 + nskb = napi_build_skb(va, truesize); 763 765 if (!nskb) { 764 - page_pool_put_full_page(rq->page_pool, page, true); 766 + page_pool_free_va(rq->page_pool, va, true); 765 767 goto drop; 766 768 } 767 769 ··· 772 768 skb_copy_header(nskb, skb); 773 769 skb_mark_for_recycle(nskb); 774 770 775 - size = min_t(u32, skb->len, max_head_size); 776 771 if (skb_copy_bits(skb, 0, nskb->data, size)) { 777 772 consume_skb(nskb); 778 773 goto drop; ··· 786 783 len = skb->len - off; 787 784 788 785 for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { 789 - page = page_pool_dev_alloc_pages(rq->page_pool); 786 + size = min_t(u32, len, PAGE_SIZE); 787 + truesize = size; 788 + 789 + page = page_pool_dev_alloc(rq->page_pool, &page_offset, 790 + &truesize); 790 791 if (!page) { 791 792 consume_skb(nskb); 792 793 goto drop; 793 794 } 794 795 795 - size = min_t(u32, len, PAGE_SIZE); 796 - skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE); 796 + skb_add_rx_frag(nskb, i, page, page_offset, size, 797 + truesize); 797 798 
if (skb_copy_bits(skb, off, page_address(page), 798 799 size)) { 799 800 consume_skb(nskb);
+1 -1
drivers/net/wireless/mediatek/mt76/mac80211.c
··· 570 570 { 571 571 struct page_pool_params pp_params = { 572 572 .order = 0, 573 - .flags = PP_FLAG_PAGE_FRAG, 573 + .flags = 0, 574 574 .nid = NUMA_NO_NODE, 575 575 .dev = dev->dma_dev, 576 576 };
+180 -26
include/net/page_pool/helpers.h
··· 8 8 /** 9 9 * DOC: page_pool allocator 10 10 * 11 - * The page_pool allocator is optimized for the XDP mode that 12 - * uses one frame per-page, but it can fallback on the 13 - * regular page allocator APIs. 11 + * The page_pool allocator is optimized for recycling page or page fragment used 12 + * by skb packet and xdp frame. 14 13 * 15 - * Basic use involves replacing alloc_pages() calls with the 16 - * page_pool_alloc_pages() call. Drivers should use 17 - * page_pool_dev_alloc_pages() replacing dev_alloc_pages(). 14 + * Basic use involves replacing and alloc_pages() calls with page_pool_alloc(), 15 + * which allocate memory with or without page splitting depending on the 16 + * requested memory size. 18 17 * 19 - * The API keeps track of in-flight pages, in order to let API users know 20 - * when it is safe to free a page_pool object. Thus, API users 21 - * must call page_pool_put_page() to free the page, or attach 22 - * the page to a page_pool-aware object like skbs marked with 18 + * If the driver knows that it always requires full pages or its allocations are 19 + * always smaller than half a page, it can use one of the more specific API 20 + * calls: 21 + * 22 + * 1. page_pool_alloc_pages(): allocate memory without page splitting when 23 + * driver knows that the memory it need is always bigger than half of the page 24 + * allocated from page pool. There is no cache line dirtying for 'struct page' 25 + * when a page is recycled back to the page pool. 26 + * 27 + * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver 28 + * knows that the memory it need is always smaller than or equal to half of the 29 + * page allocated from page pool. Page splitting enables memory saving and thus 30 + * avoids TLB/cache miss for data access, but there also is some cost to 31 + * implement page splitting, mainly some cache line dirtying/bouncing for 32 + * 'struct page' and atomic operation for page->pp_frag_count. 
33 + * 34 + * The API keeps track of in-flight pages, in order to let API users know when 35 + * it is safe to free a page_pool object, the API users must call 36 + * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or 37 + * attach the page_pool object to a page_pool-aware object like skbs marked with 23 38 * skb_mark_for_recycle(). 24 39 * 25 - * API users must call page_pool_put_page() once on a page, as it 26 - * will either recycle the page, or in case of refcnt > 1, it will 27 - * release the DMA mapping and in-flight state accounting. 40 + * page_pool_put_page() may be called multi times on the same page if a page is 41 + * split into multi fragments. For the last fragment, it will either recycle the 42 + * page, or in case of page->_refcount > 1, it will release the DMA mapping and 43 + * in-flight state accounting. 44 + * 45 + * dma_sync_single_range_for_device() is only called for the last fragment when 46 + * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the 47 + * last freed fragment to do the sync_for_device operation for all fragments in 48 + * the same page when a page is split, the API user must setup pool->p.max_len 49 + * and pool->p.offset correctly and ensure that page_pool_put_page() is called 50 + * with dma_sync_size being -1 for fragment API. 28 51 */ 29 52 #ifndef _NET_PAGE_POOL_HELPERS_H 30 53 #define _NET_PAGE_POOL_HELPERS_H ··· 96 73 return page_pool_alloc_pages(pool, gfp); 97 74 } 98 75 76 + /** 77 + * page_pool_dev_alloc_frag() - allocate a page fragment. 78 + * @pool: pool from which to allocate 79 + * @offset: offset to the allocated page 80 + * @size: requested size 81 + * 82 + * Get a page fragment from the page allocator or page_pool caches. 83 + * 84 + * Return: 85 + * Return allocated page fragment, otherwise return NULL. 
86 + */ 99 87 static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, 100 88 unsigned int *offset, 101 89 unsigned int size) ··· 114 80 gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); 115 81 116 82 return page_pool_alloc_frag(pool, offset, size, gfp); 83 + } 84 + 85 + static inline struct page *page_pool_alloc(struct page_pool *pool, 86 + unsigned int *offset, 87 + unsigned int *size, gfp_t gfp) 88 + { 89 + unsigned int max_size = PAGE_SIZE << pool->p.order; 90 + struct page *page; 91 + 92 + if ((*size << 1) > max_size) { 93 + *size = max_size; 94 + *offset = 0; 95 + return page_pool_alloc_pages(pool, gfp); 96 + } 97 + 98 + page = page_pool_alloc_frag(pool, offset, *size, gfp); 99 + if (unlikely(!page)) 100 + return NULL; 101 + 102 + /* There is very likely not enough space for another fragment, so append 103 + * the remaining size to the current fragment to avoid truesize 104 + * underestimate problem. 105 + */ 106 + if (pool->frag_offset + *size > max_size) { 107 + *size = max_size - *offset; 108 + pool->frag_offset = max_size; 109 + } 110 + 111 + return page; 112 + } 113 + 114 + /** 115 + * page_pool_dev_alloc() - allocate a page or a page fragment. 116 + * @pool: pool from which to allocate 117 + * @offset: offset to the allocated page 118 + * @size: in as the requested size, out as the allocated size 119 + * 120 + * Get a page or a page fragment from the page allocator or page_pool caches 121 + * depending on the requested size in order to allocate memory with least memory 122 + * utilization and performance penalty. 123 + * 124 + * Return: 125 + * Return allocated page or page fragment, otherwise return NULL. 
126 + */ 127 + static inline struct page *page_pool_dev_alloc(struct page_pool *pool, 128 + unsigned int *offset, 129 + unsigned int *size) 130 + { 131 + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); 132 + 133 + return page_pool_alloc(pool, offset, size, gfp); 134 + } 135 + 136 + static inline void *page_pool_alloc_va(struct page_pool *pool, 137 + unsigned int *size, gfp_t gfp) 138 + { 139 + unsigned int offset; 140 + struct page *page; 141 + 142 + /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ 143 + page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); 144 + if (unlikely(!page)) 145 + return NULL; 146 + 147 + return page_address(page) + offset; 148 + } 149 + 150 + /** 151 + * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its 152 + * va. 153 + * @pool: pool from which to allocate 154 + * @size: in as the requested size, out as the allocated size 155 + * 156 + * This is just a thin wrapper around the page_pool_alloc() API, and 157 + * it returns va of the allocated page or page fragment. 158 + * 159 + * Return: 160 + * Return the va for the allocated page or page fragment, otherwise return NULL. 161 + */ 162 + static inline void *page_pool_dev_alloc_va(struct page_pool *pool, 163 + unsigned int *size) 164 + { 165 + gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); 166 + 167 + return page_pool_alloc_va(pool, size, gfp); 117 168 } 118 169 119 170 /** ··· 234 115 long ret; 235 116 236 117 /* If nr == pp_frag_count then we have cleared all remaining 237 - * references to the page. No need to actually overwrite it, instead 238 - * we can leave this to be overwritten by the calling function. 118 + * references to the page: 119 + * 1. 'n == 1': no need to actually overwrite it. 120 + * 2. 'n != 1': overwrite it with one, which is the rare case 121 + * for pp_frag_count draining. 
239 122 * 240 - * The main advantage to doing this is that an atomic_read is 241 - * generally a much cheaper operation than an atomic update, 242 - * especially when dealing with a page that may be partitioned 243 - * into only 2 or 3 pieces. 123 + * The main advantage to doing this is that not only we avoid a atomic 124 + * update, as an atomic_read is generally a much cheaper operation than 125 + * an atomic update, especially when dealing with a page that may be 126 + * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count 127 + * handling by ensuring all pages have partitioned into only 1 piece 128 + * initially, and only overwrite it when the page is partitioned into 129 + * more than one piece. 244 130 */ 245 - if (atomic_long_read(&page->pp_frag_count) == nr) 131 + if (atomic_long_read(&page->pp_frag_count) == nr) { 132 + /* As we have ensured nr is always one for constant case using 133 + * the BUILD_BUG_ON(), only need to handle the non-constant case 134 + * here for pp_frag_count draining, which is a rare case. 135 + */ 136 + BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); 137 + if (!__builtin_constant_p(nr)) 138 + atomic_long_set(&page->pp_frag_count, 1); 139 + 246 140 return 0; 141 + } 247 142 248 143 ret = atomic_long_sub_return(nr, &page->pp_frag_count); 249 144 WARN_ON(ret < 0); 145 + 146 + /* We are the last user here too, reset pp_frag_count back to 1 to 147 + * ensure all pages have been partitioned into 1 piece initially, 148 + * this should be the rare case when the last two fragment users call 149 + * page_pool_defrag_page() currently. 
150 + */ 151 + if (unlikely(!ret)) 152 + atomic_long_set(&page->pp_frag_count, 1); 153 + 250 154 return ret; 251 155 } 252 156 253 - static inline bool page_pool_is_last_frag(struct page_pool *pool, 254 - struct page *page) 157 + static inline bool page_pool_is_last_frag(struct page *page) 255 158 { 256 - /* If fragments aren't enabled or count is 0 we were the last user */ 257 - return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || 258 - (page_pool_defrag_page(page, 1) == 0); 159 + /* If page_pool_defrag_page() returns 0, we were the last user */ 160 + return page_pool_defrag_page(page, 1) == 0; 259 161 } 260 162 261 163 /** ··· 301 161 * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 302 162 */ 303 163 #ifdef CONFIG_PAGE_POOL 304 - if (!page_pool_is_last_frag(pool, page)) 164 + if (!page_pool_is_last_frag(page)) 305 165 return; 306 166 307 167 page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); ··· 339 199 340 200 #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ 341 201 (sizeof(dma_addr_t) > sizeof(unsigned long)) 202 + 203 + /** 204 + * page_pool_free_va() - free a va into the page_pool 205 + * @pool: pool from which va was allocated 206 + * @va: va to be freed 207 + * @allow_direct: freed by the consumer, allow lockless caching 208 + * 209 + * Free a va allocated from page_pool_allo_va(). 210 + */ 211 + static inline void page_pool_free_va(struct page_pool *pool, void *va, 212 + bool allow_direct) 213 + { 214 + page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); 215 + } 342 216 343 217 /** 344 218 * page_pool_get_dma_addr() - Retrieve the stored DMA address.
+2 -4
include/net/page_pool/types.h
··· 17 17 * Please note DMA-sync-for-CPU is still 18 18 * device driver responsibility 19 19 */ 20 - #define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ 21 20 #define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ 22 - PP_FLAG_DMA_SYNC_DEV |\ 23 - PP_FLAG_PAGE_FRAG) 21 + PP_FLAG_DMA_SYNC_DEV) 24 22 25 23 /* 26 24 * Fast allocation side cache array/stack ··· 43 45 44 46 /** 45 47 * struct page_pool_params - page pool parameters 46 - * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_PAGE_FRAG 48 + * @flags: PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV 47 49 * @order: 2^order pages on allocation 48 50 * @pool_size: size of the ptr_ring 49 51 * @nid: NUMA node id to allocate from pages from
+12 -5
net/core/page_pool.c
··· 376 376 { 377 377 page->pp = pool; 378 378 page->pp_magic |= PP_SIGNATURE; 379 + 380 + /* Ensuring all pages have been split into one fragment initially: 381 + * page_pool_set_pp_info() is only called once for every page when it 382 + * is allocated from the page allocator and page_pool_fragment_page() 383 + * is dirtying the same cache line as the page->pp_magic above, so 384 + * the overhead is negligible. 385 + */ 386 + page_pool_fragment_page(page, 1); 379 387 if (pool->p.init_callback) 380 388 pool->p.init_callback(page, pool->p.init_arg); 381 389 } ··· 680 672 struct page *page = virt_to_head_page(data[i]); 681 673 682 674 /* It is not the last user for the page frag case */ 683 - if (!page_pool_is_last_frag(pool, page)) 675 + if (!page_pool_is_last_frag(page)) 684 676 continue; 685 677 686 678 page = __page_pool_put_page(pool, page, -1, false); ··· 756 748 unsigned int max_size = PAGE_SIZE << pool->p.order; 757 749 struct page *page = pool->frag_page; 758 750 759 - if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || 760 - size > max_size)) 751 + if (WARN_ON(size > max_size)) 761 752 return NULL; 762 753 763 754 size = ALIGN(size, dma_get_cache_alignment()); ··· 809 802 } 810 803 } 811 804 812 - static void page_pool_free(struct page_pool *pool) 805 + static void __page_pool_destroy(struct page_pool *pool) 813 806 { 814 807 if (pool->disconnect) 815 808 pool->disconnect(pool); ··· 860 853 page_pool_scrub(pool); 861 854 inflight = page_pool_inflight(pool); 862 855 if (!inflight) 863 - page_pool_free(pool); 856 + __page_pool_destroy(pool); 864 857 865 858 return inflight; 866 859 }
+1 -1
net/core/skbuff.c
··· 5765 5765 /* In general, avoid mixing page_pool and non-page_pool allocated 5766 5766 * pages within the same SKB. Additionally avoid dealing with clones 5767 5767 * with page_pool pages, in case the SKB is using page_pool fragment 5768 - * references (PP_FLAG_PAGE_FRAG). Since we only take full page 5768 + * references (page_pool_alloc_frag()). Since we only take full page 5769 5769 * references for cloned SKBs at the moment that would result in 5770 5770 * inconsistent reference counts. 5771 5771 * In theory we could take full references if @from is cloned and