Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'fix-late-dma-unmap-crash-for-page-pool'

Toke Høiland-Jørgensen says:

====================
Fix late DMA unmap crash for page pool

This series fixes the late dma_unmap crash for page pool first reported
by Yonglong Liu in [0]. It is an alternative approach to the one
submitted by Yunsheng Lin, most recently in [1]. The first commit just
wraps some tests in a helper function, in preparation for the main change
in patch 2. See the commit message of patch 2 for the details.

[0] https://lore.kernel.org/8067f204-1380-4d37-8ffd-007fc6f26738@kernel.org
[1] https://lore.kernel.org/20250307092356.638242-1-linyunsheng@huawei.com

v8: https://lore.kernel.org/20250407-page-pool-track-dma-v8-0-da9500d4ba21@redhat.com
v7: https://lore.kernel.org/20250404-page-pool-track-dma-v7-0-ad34f069bc18@redhat.com
v6: https://lore.kernel.org/20250401-page-pool-track-dma-v6-0-8b83474870d4@redhat.com
v5: https://lore.kernel.org/20250328-page-pool-track-dma-v5-0-55002af683ad@redhat.com
v4: https://lore.kernel.org/20250327-page-pool-track-dma-v4-0-b380dc6706d0@redhat.com
v3: https://lore.kernel.org/20250326-page-pool-track-dma-v3-0-8e464016e0ac@redhat.com
v2: https://lore.kernel.org/20250325-page-pool-track-dma-v2-0-113ebc1946f3@redhat.com
v1: https://lore.kernel.org/20250314-page-pool-track-dma-v1-0-c212e57a74c2@redhat.com
====================

Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-0-6a9ef2e0cba8@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+176 -38
+2 -2
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
··· 707 707 xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); 708 708 page = xdpi.page.page; 709 709 710 - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) 711 - * as we know this is a page_pool page. 710 + /* No need to check page_pool_page_is_pp() as we 711 + * know this is a page_pool page. 712 712 */ 713 713 page_pool_recycle_direct(page->pp, page); 714 714 } while (++n < num);
+58
include/linux/mm.h
··· 4248 4248 #define VM_SEALED_SYSMAP VM_NONE 4249 4249 #endif 4250 4250 4251 + /* 4252 + * DMA mapping IDs for page_pool 4253 + * 4254 + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and 4255 + * stashes it in the upper bits of page->pp_magic. We always want to be able to 4256 + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP 4257 + * pages can have arbitrary kernel pointers stored in the same field as pp_magic 4258 + * (since it overlaps with page->lru.next), so we must ensure that we cannot 4259 + * mistake a valid kernel pointer with any of the values we write into this 4260 + * field. 4261 + * 4262 + * On architectures that set POISON_POINTER_DELTA, this is already ensured, 4263 + * since this value becomes part of PP_SIGNATURE; meaning we can just use the 4264 + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the 4265 + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is 4266 + * 0, we make sure that we leave the two topmost bits empty, as that guarantees 4267 + * we won't mistake a valid kernel pointer for a value we set, regardless of the 4268 + * VMSPLIT setting. 4269 + * 4270 + * Altogether, this means that the number of bits available is constrained by 4271 + * the size of an unsigned long (at the upper end, subtracting two bits per the 4272 + * above), and the definition of PP_SIGNATURE (with or without 4273 + * POISON_POINTER_DELTA). 4274 + */ 4275 + #define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) 4276 + #if POISON_POINTER_DELTA > 0 4277 + /* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA 4278 + * index to not overlap with that if set 4279 + */ 4280 + #define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) 4281 + #else 4282 + /* Always leave out the topmost two; see above. 
*/ 4283 + #define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) 4284 + #endif 4285 + 4286 + #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ 4287 + PP_DMA_INDEX_SHIFT) 4288 + 4289 + /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is 4290 + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for 4291 + * the head page of compound page and bit 1 for pfmemalloc page, as well as the 4292 + * bits used for the DMA index. page_is_pfmemalloc() is checked in 4293 + * __page_pool_put_page() to avoid recycling the pfmemalloc page. 4294 + */ 4295 + #define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) 4296 + 4297 + #ifdef CONFIG_PAGE_POOL 4298 + static inline bool page_pool_page_is_pp(struct page *page) 4299 + { 4300 + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; 4301 + } 4302 + #else 4303 + static inline bool page_pool_page_is_pp(struct page *page) 4304 + { 4305 + return false; 4306 + } 4307 + #endif 4308 + 4251 4309 #endif /* _LINUX_MM_H */
+4
include/linux/poison.h
··· 70 70 #define KEY_DESTROY 0xbd 71 71 72 72 /********** net/core/page_pool.c **********/ 73 + /* 74 + * page_pool uses additional free bits within this value to store data, see the 75 + * definition of PP_DMA_INDEX_MASK in mm.h 76 + */ 73 77 #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) 74 78 75 79 /********** net/core/skbuff.c **********/
+6
include/net/page_pool/types.h
··· 6 6 #include <linux/dma-direction.h> 7 7 #include <linux/ptr_ring.h> 8 8 #include <linux/types.h> 9 + #include <linux/xarray.h> 9 10 #include <net/netmem.h> 10 11 11 12 #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA ··· 33 32 34 33 #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ 35 34 PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) 35 + 36 + /* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ 37 + #define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) 36 38 37 39 /* 38 40 * Fast allocation side cache array/stack ··· 224 220 225 221 void *mp_priv; 226 222 const struct memory_provider_ops *mp_ops; 223 + 224 + struct xarray dma_mapped; 227 225 228 226 #ifdef CONFIG_PAGE_POOL_STATS 229 227 /* recycle stats are per-cpu to avoid locking */
+2 -6
mm/page_alloc.c
··· 897 897 #ifdef CONFIG_MEMCG 898 898 page->memcg_data | 899 899 #endif 900 - #ifdef CONFIG_PAGE_POOL 901 - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | 902 - #endif 900 + page_pool_page_is_pp(page) | 903 901 (page->flags & check_flags))) 904 902 return false; 905 903 ··· 924 926 if (unlikely(page->memcg_data)) 925 927 bad_reason = "page still charged to cgroup"; 926 928 #endif 927 - #ifdef CONFIG_PAGE_POOL 928 - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) 929 + if (unlikely(page_pool_page_is_pp(page))) 929 930 bad_reason = "page_pool leak"; 930 - #endif 931 931 return bad_reason; 932 932 } 933 933
+32 -1
net/core/netmem_priv.h
··· 5 5 6 6 static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) 7 7 { 8 - return __netmem_clear_lsb(netmem)->pp_magic; 8 + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; 9 9 } 10 10 11 11 static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) ··· 15 15 16 16 static inline void netmem_clear_pp_magic(netmem_ref netmem) 17 17 { 18 + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); 19 + 18 20 __netmem_clear_lsb(netmem)->pp_magic = 0; 21 + } 22 + 23 + static inline bool netmem_is_pp(netmem_ref netmem) 24 + { 25 + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; 19 26 } 20 27 21 28 static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) ··· 34 27 unsigned long dma_addr) 35 28 { 36 29 __netmem_clear_lsb(netmem)->dma_addr = dma_addr; 30 + } 31 + 32 + static inline unsigned long netmem_get_dma_index(netmem_ref netmem) 33 + { 34 + unsigned long magic; 35 + 36 + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) 37 + return 0; 38 + 39 + magic = __netmem_clear_lsb(netmem)->pp_magic; 40 + 41 + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; 42 + } 43 + 44 + static inline void netmem_set_dma_index(netmem_ref netmem, 45 + unsigned long id) 46 + { 47 + unsigned long magic; 48 + 49 + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) 50 + return; 51 + 52 + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); 53 + __netmem_clear_lsb(netmem)->pp_magic = magic; 37 54 } 38 55 #endif
+68 -13
net/core/page_pool.c
··· 276 276 /* Driver calling page_pool_create() also call page_pool_destroy() */ 277 277 refcount_set(&pool->user_cnt, 1); 278 278 279 - if (pool->dma_map) 280 - get_device(pool->p.dev); 279 + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); 281 280 282 281 if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { 283 282 netdev_assert_locked(pool->slow.netdev); ··· 319 320 static void page_pool_uninit(struct page_pool *pool) 320 321 { 321 322 ptr_ring_cleanup(&pool->ring, NULL); 322 - 323 - if (pool->dma_map) 324 - put_device(pool->p.dev); 323 + xa_destroy(&pool->dma_mapped); 325 324 326 325 #ifdef CONFIG_PAGE_POOL_STATS 327 326 if (!pool->system) ··· 460 463 netmem_ref netmem, 461 464 u32 dma_sync_size) 462 465 { 463 - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) 464 - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); 466 + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { 467 + rcu_read_lock(); 468 + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ 469 + if (pool->dma_sync) 470 + __page_pool_dma_sync_for_device(pool, netmem, 471 + dma_sync_size); 472 + rcu_read_unlock(); 473 + } 465 474 } 466 475 467 - static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) 476 + static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) 468 477 { 469 478 dma_addr_t dma; 479 + int err; 480 + u32 id; 470 481 471 482 /* Setup DMA mapping: use 'struct page' area for storing DMA-addr 472 483 * since dma_addr_t can be either 32 or 64 bits and does not always fit ··· 488 483 if (dma_mapping_error(pool->p.dev, dma)) 489 484 return false; 490 485 491 - if (page_pool_set_dma_addr_netmem(netmem, dma)) 486 + if (page_pool_set_dma_addr_netmem(netmem, dma)) { 487 + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); 492 488 goto unmap_failed; 489 + } 493 490 491 + if (in_softirq()) 492 + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), 493 + PP_DMA_INDEX_LIMIT, gfp); 494 + 
else 495 + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), 496 + PP_DMA_INDEX_LIMIT, gfp); 497 + if (err) { 498 + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); 499 + goto unset_failed; 500 + } 501 + 502 + netmem_set_dma_index(netmem, id); 494 503 page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); 495 504 496 505 return true; 497 506 507 + unset_failed: 508 + page_pool_set_dma_addr_netmem(netmem, 0); 498 509 unmap_failed: 499 - WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); 500 510 dma_unmap_page_attrs(pool->p.dev, dma, 501 511 PAGE_SIZE << pool->p.order, pool->p.dma_dir, 502 512 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); ··· 528 508 if (unlikely(!page)) 529 509 return NULL; 530 510 531 - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { 511 + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { 532 512 put_page(page); 533 513 return NULL; 534 514 } ··· 574 554 */ 575 555 for (i = 0; i < nr_pages; i++) { 576 556 netmem = pool->alloc.cache[i]; 577 - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { 557 + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { 578 558 put_page(netmem_to_page(netmem)); 579 559 continue; 580 560 } ··· 676 656 static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, 677 657 netmem_ref netmem) 678 658 { 659 + struct page *old, *page = netmem_to_page(netmem); 660 + unsigned long id; 679 661 dma_addr_t dma; 680 662 681 663 if (!pool->dma_map) 682 664 /* Always account for inflight pages, even if we didn't 683 665 * map them 684 666 */ 667 + return; 668 + 669 + id = netmem_get_dma_index(netmem); 670 + if (!id) 671 + return; 672 + 673 + if (in_softirq()) 674 + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); 675 + else 676 + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); 677 + if (old != page) 685 678 return; 686 679 687 680 
dma = page_pool_get_dma_addr_netmem(netmem); ··· 704 671 PAGE_SIZE << pool->p.order, pool->p.dma_dir, 705 672 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); 706 673 page_pool_set_dma_addr_netmem(netmem, 0); 674 + netmem_set_dma_index(netmem, 0); 707 675 } 708 676 709 677 /* Disconnects a page (from a page_pool). API users can have a need ··· 1114 1080 1115 1081 static void page_pool_scrub(struct page_pool *pool) 1116 1082 { 1083 + unsigned long id; 1084 + void *ptr; 1085 + 1117 1086 page_pool_empty_alloc_cache_once(pool); 1118 - pool->destroy_cnt++; 1087 + if (!pool->destroy_cnt++ && pool->dma_map) { 1088 + if (pool->dma_sync) { 1089 + /* Disable page_pool_dma_sync_for_device() */ 1090 + pool->dma_sync = false; 1091 + 1092 + /* Make sure all concurrent returns that may see the old 1093 + * value of dma_sync (and thus perform a sync) have 1094 + * finished before doing the unmapping below. Skip the 1095 + * wait if the device doesn't actually need syncing, or 1096 + * if there are no outstanding mapped pages. 1097 + */ 1098 + if (dma_dev_need_sync(pool->p.dev) && 1099 + !xa_empty(&pool->dma_mapped)) 1100 + synchronize_net(); 1101 + } 1102 + 1103 + xa_for_each(&pool->dma_mapped, id, ptr) 1104 + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); 1105 + } 1119 1106 1120 1107 /* No more consumers should exist, but producers could still 1121 1108 * be in-flight.
+2 -14
net/core/skbuff.c
··· 893 893 skb_get(list); 894 894 } 895 895 896 - static bool is_pp_netmem(netmem_ref netmem) 897 - { 898 - return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; 899 - } 900 - 901 896 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, 902 897 unsigned int headroom) 903 898 { ··· 990 995 { 991 996 netmem = netmem_compound_head(netmem); 992 997 993 - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation 994 - * in order to preserve any existing bits, such as bit 0 for the 995 - * head page of compound page and bit 1 for pfmemalloc page, so 996 - * mask those bits for freeing side when doing below checking, 997 - * and page_is_pfmemalloc() is checked in __page_pool_put_page() 998 - * to avoid recycling the pfmemalloc page. 999 - */ 1000 - if (unlikely(!is_pp_netmem(netmem))) 998 + if (unlikely(!netmem_is_pp(netmem))) 1001 999 return false; 1002 1000 1003 1001 page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); ··· 1030 1042 1031 1043 for (i = 0; i < shinfo->nr_frags; i++) { 1032 1044 head_netmem = netmem_compound_head(shinfo->frags[i].netmem); 1033 - if (likely(is_pp_netmem(head_netmem))) 1045 + if (likely(netmem_is_pp(head_netmem))) 1034 1046 page_pool_ref_netmem(head_netmem); 1035 1047 else 1036 1048 page_ref_inc(netmem_to_page(head_netmem));
+2 -2
net/core/xdp.c
··· 438 438 netmem = netmem_compound_head(netmem); 439 439 if (napi_direct && xdp_return_frame_no_direct()) 440 440 napi_direct = false; 441 - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) 442 - * as mem->type knows this a page_pool page 441 + /* No need to check netmem_is_pp() as mem->type knows this a 442 + * page_pool page 443 443 */ 444 444 page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, 445 445 napi_direct);