Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

xsk: avoid data corruption on cq descriptor number

Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
production"), the descriptor number is stored in the skb control block and
xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
the pool's completion queue.

The skb control block shouldn't be used for this purpose because, after
transmit, xsk no longer has control over it and other subsystems could use
it. This leads to the following kernel panic due to a NULL pointer dereference.

BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] SMP NOPTI
CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
RIP: 0010:xsk_destruct_skb+0xd0/0x180
[...]
Call Trace:
<IRQ>
? napi_complete_done+0x7a/0x1a0
ip_rcv_core+0x1bb/0x340
ip_rcv+0x30/0x1f0
__netif_receive_skb_one_core+0x85/0xa0
process_backlog+0x87/0x130
__napi_poll+0x28/0x180
net_rx_action+0x339/0x420
handle_softirqs+0xdc/0x320
? handle_edge_irq+0x90/0x1e0
do_softirq.part.0+0x3b/0x60
</IRQ>
<TASK>
__local_bh_enable_ip+0x60/0x70
__dev_direct_xmit+0x14e/0x1f0
__xsk_generic_xmit+0x482/0xb70
? __remove_hrtimer+0x41/0xa0
? __xsk_generic_xmit+0x51/0xb70
? _raw_spin_unlock_irqrestore+0xe/0x40
xsk_sendmsg+0xda/0x1c0
__sys_sendto+0x1ee/0x200
__x64_sys_sendto+0x24/0x30
do_syscall_64+0x84/0x2f0
? __pfx_pollwake+0x10/0x10
? __rseq_handle_notify_resume+0xad/0x4c0
? restore_fpregs_from_fpstate+0x3c/0x90
? switch_fpu_return+0x5b/0xe0
? do_syscall_64+0x204/0x2f0
? do_syscall_64+0x204/0x2f0
? do_syscall_64+0x204/0x2f0
entry_SYSCALL_64_after_hwframe+0x76/0x7e
</TASK>
[...]
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

Instead, use the skb destructor_arg pointer along with pointer tagging.
As pointers are always aligned to 8B, use the bottom bit to indicate
whether this is a single address or an allocated struct containing several
addresses.

Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20251124171409.3845-1-fmancera@suse.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Fernando Fernandez Mancera and committed by
Jakub Kicinski
0ebc27a4 ae1737e7

+88 -55
+88 -55
net/xdp/xsk.c
··· 36 36 #define TX_BATCH_SIZE 32 37 37 #define MAX_PER_SOCKET_BUDGET 32 38 38 39 - struct xsk_addr_node { 40 - u64 addr; 41 - struct list_head addr_node; 42 - }; 43 - 44 - struct xsk_addr_head { 39 + struct xsk_addrs { 45 40 u32 num_descs; 46 - struct list_head addrs_list; 41 + u64 addrs[MAX_SKB_FRAGS + 1]; 47 42 }; 48 43 49 44 static struct kmem_cache *xsk_tx_generic_cache; 50 - 51 - #define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb)) 52 45 53 46 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) 54 47 { ··· 551 558 return ret; 552 559 } 553 560 561 + static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) 562 + { 563 + return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; 564 + } 565 + 566 + static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) 567 + { 568 + return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); 569 + } 570 + 571 + static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) 572 + { 573 + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); 574 + } 575 + 576 + static void xsk_inc_num_desc(struct sk_buff *skb) 577 + { 578 + struct xsk_addrs *xsk_addr; 579 + 580 + if (!xsk_skb_destructor_is_addr(skb)) { 581 + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 582 + xsk_addr->num_descs++; 583 + } 584 + } 585 + 586 + static u32 xsk_get_num_desc(struct sk_buff *skb) 587 + { 588 + struct xsk_addrs *xsk_addr; 589 + 590 + if (xsk_skb_destructor_is_addr(skb)) 591 + return 1; 592 + 593 + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 594 + 595 + return xsk_addr->num_descs; 596 + } 597 + 554 598 static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, 555 599 struct sk_buff *skb) 556 600 { 557 - struct xsk_addr_node *pos, *tmp; 601 + u32 num_descs = xsk_get_num_desc(skb); 602 + struct xsk_addrs *xsk_addr; 558 603 u32 descs_processed = 0; 559 604 unsigned long flags; 560 - u32 idx; 605 + u32 idx, i; 561 606 562 607 spin_lock_irqsave(&pool->cq_lock, flags); 
563 608 idx = xskq_get_prod(pool->cq); 564 609 565 - xskq_prod_write_addr(pool->cq, idx, 566 - (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg); 567 - descs_processed++; 610 + if (unlikely(num_descs > 1)) { 611 + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 568 612 569 - if (unlikely(XSKCB(skb)->num_descs > 1)) { 570 - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { 613 + for (i = 0; i < num_descs; i++) { 571 614 xskq_prod_write_addr(pool->cq, idx + descs_processed, 572 - pos->addr); 615 + xsk_addr->addrs[i]); 573 616 descs_processed++; 574 - list_del(&pos->addr_node); 575 - kmem_cache_free(xsk_tx_generic_cache, pos); 576 617 } 618 + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 619 + } else { 620 + xskq_prod_write_addr(pool->cq, idx, 621 + xsk_skb_destructor_get_addr(skb)); 622 + descs_processed++; 577 623 } 578 624 xskq_prod_submit_n(pool->cq, descs_processed); 579 625 spin_unlock_irqrestore(&pool->cq_lock, flags); ··· 625 593 spin_lock_irqsave(&pool->cq_lock, flags); 626 594 xskq_prod_cancel_n(pool->cq, n); 627 595 spin_unlock_irqrestore(&pool->cq_lock, flags); 628 - } 629 - 630 - static void xsk_inc_num_desc(struct sk_buff *skb) 631 - { 632 - XSKCB(skb)->num_descs++; 633 - } 634 - 635 - static u32 xsk_get_num_desc(struct sk_buff *skb) 636 - { 637 - return XSKCB(skb)->num_descs; 638 596 } 639 597 640 598 static void xsk_destruct_skb(struct sk_buff *skb) ··· 643 621 static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, 644 622 u64 addr) 645 623 { 646 - BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); 647 - INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); 648 624 skb->dev = xs->dev; 649 625 skb->priority = READ_ONCE(xs->sk.sk_priority); 650 626 skb->mark = READ_ONCE(xs->sk.sk_mark); 651 - XSKCB(skb)->num_descs = 0; 652 627 skb->destructor = xsk_destruct_skb; 653 - skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; 628 + xsk_skb_destructor_set_addr(skb, addr); 654 629 } 655 630 656 631 
static void xsk_consume_skb(struct sk_buff *skb) 657 632 { 658 633 struct xdp_sock *xs = xdp_sk(skb->sk); 659 634 u32 num_descs = xsk_get_num_desc(skb); 660 - struct xsk_addr_node *pos, *tmp; 635 + struct xsk_addrs *xsk_addr; 661 636 662 637 if (unlikely(num_descs > 1)) { 663 - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { 664 - list_del(&pos->addr_node); 665 - kmem_cache_free(xsk_tx_generic_cache, pos); 666 - } 638 + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 639 + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); 667 640 } 668 641 669 642 skb->destructor = sock_wfree; ··· 718 701 { 719 702 struct xsk_buff_pool *pool = xs->pool; 720 703 u32 hr, len, ts, offset, copy, copied; 721 - struct xsk_addr_node *xsk_addr; 722 704 struct sk_buff *skb = xs->skb; 723 705 struct page *page; 724 706 void *buffer; ··· 743 727 return ERR_PTR(err); 744 728 } 745 729 } else { 746 - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); 747 - if (!xsk_addr) 748 - return ERR_PTR(-ENOMEM); 730 + struct xsk_addrs *xsk_addr; 731 + 732 + if (xsk_skb_destructor_is_addr(skb)) { 733 + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, 734 + GFP_KERNEL); 735 + if (!xsk_addr) 736 + return ERR_PTR(-ENOMEM); 737 + 738 + xsk_addr->num_descs = 1; 739 + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); 740 + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; 741 + } else { 742 + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 743 + } 749 744 750 745 /* in case of -EOVERFLOW that could happen below, 751 746 * xsk_consume_skb() will release this node as whole skb 752 747 * would be dropped, which implies freeing all list elements 753 748 */ 754 - xsk_addr->addr = desc->addr; 755 - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); 749 + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 756 750 } 757 751 758 752 len = desc->len; ··· 839 813 } 840 814 } else { 841 815 int nr_frags = skb_shinfo(skb)->nr_frags; 842 
- struct xsk_addr_node *xsk_addr; 816 + struct xsk_addrs *xsk_addr; 843 817 struct page *page; 844 818 u8 *vaddr; 819 + 820 + if (xsk_skb_destructor_is_addr(skb)) { 821 + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, 822 + GFP_KERNEL); 823 + if (!xsk_addr) { 824 + err = -ENOMEM; 825 + goto free_err; 826 + } 827 + 828 + xsk_addr->num_descs = 1; 829 + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); 830 + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; 831 + } else { 832 + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; 833 + } 845 834 846 835 if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { 847 836 err = -EOVERFLOW; ··· 869 828 goto free_err; 870 829 } 871 830 872 - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); 873 - if (!xsk_addr) { 874 - __free_page(page); 875 - err = -ENOMEM; 876 - goto free_err; 877 - } 878 - 879 831 vaddr = kmap_local_page(page); 880 832 memcpy(vaddr, buffer, len); 881 833 kunmap_local(vaddr); ··· 876 842 skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); 877 843 refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); 878 844 879 - xsk_addr->addr = desc->addr; 880 - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); 845 + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; 881 846 } 882 847 } 883 848 ··· 1937 1904 goto out_pernet; 1938 1905 1939 1906 xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", 1940 - sizeof(struct xsk_addr_node), 1907 + sizeof(struct xsk_addrs), 1941 1908 0, SLAB_HWCACHE_ALIGN, NULL); 1942 1909 if (!xsk_tx_generic_cache) { 1943 1910 err = -ENOMEM;