Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Martin KaFai Lau says:

====================
pull-request: bpf-next 2025-11-10

We've added 19 non-merge commits during the last 3 day(s) which contain
a total of 22 files changed, 1345 insertions(+), 197 deletions(-).

The main changes are:

1) Preserve skb metadata after a TC BPF program has changed the skb,
from Jakub Sitnicki.
This allows a TC program at the end of a TC filter chain to still see
the skb metadata, even if another TC program at the front of the chain
has changed the skb using BPF helpers.

2) Initial af_smc bpf_struct_ops support to control the smc specific
syn/synack options, from D. Wythe.

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
bpf/selftests: Add selftest for bpf_smc_hs_ctrl
net/smc: bpf: Introduce generic hook for handshake flow
bpf: Export necessary symbols for modules with struct_ops
selftests/bpf: Cover skb metadata access after bpf_skb_change_proto
selftests/bpf: Cover skb metadata access after change_head/tail helper
selftests/bpf: Cover skb metadata access after bpf_skb_adjust_room
selftests/bpf: Cover skb metadata access after vlan push/pop helper
selftests/bpf: Expect unclone to preserve skb metadata
selftests/bpf: Dump skb metadata on verification failure
selftests/bpf: Verify skb metadata in BPF instead of userspace
bpf: Make bpf_skb_change_head helper metadata-safe
bpf: Make bpf_skb_change_proto helper metadata-safe
bpf: Make bpf_skb_adjust_room metadata-safe
bpf: Make bpf_skb_vlan_push helper metadata-safe
bpf: Make bpf_skb_vlan_pop helper metadata-safe
vlan: Make vlan_remove_tag return nothing
bpf: Unclone skb head on bpf_dynptr_write to skb metadata
net: Preserve metadata on pskb_expand_head
net: Helper to move packet data and metadata after skb_push/pull
====================

Link: https://patch.msgid.link/20251110232427.3929291-1-martin.lau@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+1345 -197
+9
include/linux/filter.h
··· 1781 1781 void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); 1782 1782 void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, 1783 1783 void *buf, unsigned long len, bool flush); 1784 + int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, 1785 + const void *from, u32 len, u64 flags); 1784 1786 void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); 1785 1787 #else /* CONFIG_NET */ 1786 1788 static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, ··· 1817 1815 static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, 1818 1816 unsigned long len, bool flush) 1819 1817 { 1818 + } 1819 + 1820 + static inline int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, 1821 + const void *from, u32 len, 1822 + u64 flags) 1823 + { 1824 + return -EOPNOTSUPP; 1820 1825 } 1821 1826 1822 1827 static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
+6 -7
include/linux/if_vlan.h
··· 355 355 __be16 vlan_proto, u16 vlan_tci, 356 356 unsigned int mac_len) 357 357 { 358 + const u8 meta_len = mac_len > ETH_TLEN ? skb_metadata_len(skb) : 0; 358 359 struct vlan_ethhdr *veth; 359 360 360 - if (skb_cow_head(skb, VLAN_HLEN) < 0) 361 + if (skb_cow_head(skb, meta_len + VLAN_HLEN) < 0) 361 362 return -ENOMEM; 362 363 363 364 skb_push(skb, VLAN_HLEN); 364 365 365 366 /* Move the mac header sans proto to the beginning of the new header. */ 366 367 if (likely(mac_len > ETH_TLEN)) 367 - memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); 368 + skb_postpush_data_move(skb, VLAN_HLEN, mac_len - ETH_TLEN); 368 369 if (skb_mac_header_was_set(skb)) 369 370 skb->mac_header -= VLAN_HLEN; 370 371 ··· 732 731 * 733 732 * Expects the skb to contain a VLAN tag in the payload, and to have skb->data 734 733 * pointing at the MAC header. 735 - * 736 - * Returns: a new pointer to skb->data, or NULL on failure to pull. 737 734 */ 738 - static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) 735 + static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) 739 736 { 740 737 struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); 741 738 742 739 *vlan_tci = ntohs(vhdr->h_vlan_TCI); 743 740 744 - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); 745 741 vlan_set_encap_proto(skb, vhdr); 746 - return __skb_pull(skb, VLAN_HLEN); 742 + __skb_pull(skb, VLAN_HLEN); 743 + skb_postpull_data_move(skb, VLAN_HLEN, 2 * ETH_ALEN); 747 744 } 748 745 749 746 /**
+75
include/linux/skbuff.h
··· 4564 4564 skb_metadata_set(skb, 0); 4565 4565 } 4566 4566 4567 + /** 4568 + * skb_data_move - Move packet data and metadata after skb_push() or skb_pull(). 4569 + * @skb: packet to operate on 4570 + * @len: number of bytes pushed or pulled from &sk_buff->data 4571 + * @n: number of bytes to memmove() from pre-push/pull &sk_buff->data 4572 + * 4573 + * Moves @n bytes of packet data, can be zero, and all bytes of skb metadata. 4574 + * 4575 + * Assumes metadata is located immediately before &sk_buff->data prior to the 4576 + * push/pull, and that sufficient headroom exists to hold it after an 4577 + * skb_push(). Otherwise, metadata is cleared and a one-time warning is issued. 4578 + * 4579 + * Prefer skb_postpull_data_move() or skb_postpush_data_move() to calling this 4580 + * helper directly. 4581 + */ 4582 + static inline void skb_data_move(struct sk_buff *skb, const int len, 4583 + const unsigned int n) 4584 + { 4585 + const u8 meta_len = skb_metadata_len(skb); 4586 + u8 *meta, *meta_end; 4587 + 4588 + if (!len || (!n && !meta_len)) 4589 + return; 4590 + 4591 + if (!meta_len) 4592 + goto no_metadata; 4593 + 4594 + meta_end = skb_metadata_end(skb); 4595 + meta = meta_end - meta_len; 4596 + 4597 + if (WARN_ON_ONCE(meta_end + len != skb->data || 4598 + meta_len > skb_headroom(skb))) { 4599 + skb_metadata_clear(skb); 4600 + goto no_metadata; 4601 + } 4602 + 4603 + memmove(meta + len, meta, meta_len + n); 4604 + return; 4605 + 4606 + no_metadata: 4607 + memmove(skb->data, skb->data - len, n); 4608 + } 4609 + 4610 + /** 4611 + * skb_postpull_data_move - Move packet data and metadata after skb_pull(). 4612 + * @skb: packet to operate on 4613 + * @len: number of bytes pulled from &sk_buff->data 4614 + * @n: number of bytes to memmove() from pre-pull &sk_buff->data 4615 + * 4616 + * See skb_data_move() for details. 
4617 + */ 4618 + static inline void skb_postpull_data_move(struct sk_buff *skb, 4619 + const unsigned int len, 4620 + const unsigned int n) 4621 + { 4622 + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); 4623 + skb_data_move(skb, len, n); 4624 + } 4625 + 4626 + /** 4627 + * skb_postpush_data_move - Move packet data and metadata after skb_push(). 4628 + * @skb: packet to operate on 4629 + * @len: number of bytes pushed onto &sk_buff->data 4630 + * @n: number of bytes to memmove() from pre-push &sk_buff->data 4631 + * 4632 + * See skb_data_move() for details. 4633 + */ 4634 + static inline void skb_postpush_data_move(struct sk_buff *skb, 4635 + const unsigned int len, 4636 + const unsigned int n) 4637 + { 4638 + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); 4639 + skb_data_move(skb, -len, n); 4640 + } 4641 + 4567 4642 struct sk_buff *skb_clone_sk(struct sk_buff *skb); 4568 4643 4569 4644 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
+3
include/net/netns/smc.h
··· 17 17 #ifdef CONFIG_SYSCTL 18 18 struct ctl_table_header *smc_hdr; 19 19 #endif 20 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 21 + struct smc_hs_ctrl __rcu *hs_ctrl; 22 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 20 23 unsigned int sysctl_autocorking_size; 21 24 unsigned int sysctl_smcr_buf_type; 22 25 int sysctl_smcr_testlink_time;
+53
include/net/smc.h
··· 17 17 #include <linux/wait.h> 18 18 #include <linux/dibs.h> 19 19 20 + struct tcp_sock; 21 + struct inet_request_sock; 20 22 struct sock; 21 23 22 24 #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ ··· 51 49 wait_queue_head_t lgrs_deleted; 52 50 u8 going_away : 1; 53 51 }; 52 + 53 + #define SMC_HS_CTRL_NAME_MAX 16 54 + 55 + enum { 56 + /* ops can be inherit from init_net */ 57 + SMC_HS_CTRL_FLAG_INHERITABLE = 0x1, 58 + 59 + SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE, 60 + }; 61 + 62 + struct smc_hs_ctrl { 63 + /* private */ 64 + 65 + struct list_head list; 66 + struct module *owner; 67 + 68 + /* public */ 69 + 70 + /* unique name */ 71 + char name[SMC_HS_CTRL_NAME_MAX]; 72 + int flags; 73 + 74 + /* Invoked before computing SMC option for SYN packets. 75 + * We can control whether to set SMC options by returning various value. 76 + * Return 0 to disable SMC, or return any other value to enable it. 77 + */ 78 + int (*syn_option)(struct tcp_sock *tp); 79 + 80 + /* Invoked before Set up SMC options for SYN-ACK packets 81 + * We can control whether to respond SMC options by returning various 82 + * value. Return 0 to disable SMC, or return any other value to enable 83 + * it. 84 + */ 85 + int (*synack_option)(const struct tcp_sock *tp, 86 + struct inet_request_sock *ireq); 87 + }; 88 + 89 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 90 + #define smc_call_hsbpf(init_val, tp, func, ...) ({ \ 91 + typeof(init_val) __ret = (init_val); \ 92 + struct smc_hs_ctrl *ctrl; \ 93 + rcu_read_lock(); \ 94 + ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \ 95 + if (ctrl && ctrl->func) \ 96 + __ret = ctrl->func(tp, ##__VA_ARGS__); \ 97 + rcu_read_unlock(); \ 98 + __ret; \ 99 + }) 100 + #else 101 + #define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); }) 102 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 54 103 55 104 #endif /* _SMC_H */
+2
kernel/bpf/bpf_struct_ops.c
··· 1162 1162 map = __bpf_map_inc_not_zero(&st_map->map, false); 1163 1163 return !IS_ERR(map); 1164 1164 } 1165 + EXPORT_SYMBOL_GPL(bpf_struct_ops_get); 1165 1166 1166 1167 void bpf_struct_ops_put(const void *kdata) 1167 1168 { ··· 1174 1173 1175 1174 bpf_map_put(&st_map->map); 1176 1175 } 1176 + EXPORT_SYMBOL_GPL(bpf_struct_ops_put); 1177 1177 1178 1178 u32 bpf_struct_ops_id(const void *kdata) 1179 1179 {
+2 -4
kernel/bpf/helpers.c
··· 1842 1842 return -EINVAL; 1843 1843 return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); 1844 1844 case BPF_DYNPTR_TYPE_SKB_META: 1845 - if (flags) 1846 - return -EINVAL; 1847 - memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); 1848 - return 0; 1845 + return __bpf_skb_meta_store_bytes(dst->data, dst->offset + offset, src, 1846 + len, flags); 1849 1847 default: 1850 1848 WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); 1851 1849 return -EFAULT;
+1
kernel/bpf/syscall.c
··· 1234 1234 1235 1235 return src - orig_src; 1236 1236 } 1237 + EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); 1237 1238 1238 1239 int map_check_no_btf(const struct bpf_map *map, 1239 1240 const struct btf *btf,
+22 -12
net/core/filter.c
··· 3253 3253 3254 3254 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 3255 3255 { 3256 - /* Caller already did skb_cow() with len as headroom, 3256 + /* Caller already did skb_cow() with meta_len+len as headroom, 3257 3257 * so no need to do it here. 3258 3258 */ 3259 3259 skb_push(skb, len); 3260 - memmove(skb->data, skb->data + len, off); 3260 + skb_postpush_data_move(skb, len, off); 3261 3261 memset(skb->data + off, 0, len); 3262 3262 3263 3263 /* No skb_postpush_rcsum(skb, skb->data + off, len) ··· 3281 3281 old_data = skb->data; 3282 3282 __skb_pull(skb, len); 3283 3283 skb_postpull_rcsum(skb, old_data + off, len); 3284 - memmove(skb->data, old_data, off); 3284 + skb_postpull_data_move(skb, len, off); 3285 3285 3286 3286 return 0; 3287 3287 } ··· 3326 3326 static int bpf_skb_proto_4_to_6(struct sk_buff *skb) 3327 3327 { 3328 3328 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 3329 + const u8 meta_len = skb_metadata_len(skb); 3329 3330 u32 off = skb_mac_header_len(skb); 3330 3331 int ret; 3331 3332 3332 - ret = skb_cow(skb, len_diff); 3333 + ret = skb_cow(skb, meta_len + len_diff); 3333 3334 if (unlikely(ret < 0)) 3334 3335 return ret; 3335 3336 ··· 3490 3489 u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; 3491 3490 bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; 3492 3491 u16 mac_len = 0, inner_net = 0, inner_trans = 0; 3492 + const u8 meta_len = skb_metadata_len(skb); 3493 3493 unsigned int gso_type = SKB_GSO_DODGY; 3494 3494 int ret; 3495 3495 ··· 3501 3499 return -ENOTSUPP; 3502 3500 } 3503 3501 3504 - ret = skb_cow_head(skb, len_diff); 3502 + ret = skb_cow_head(skb, meta_len + len_diff); 3505 3503 if (unlikely(ret < 0)) 3506 3504 return ret; 3507 3505 ··· 3875 3873 static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, 3876 3874 u64 flags) 3877 3875 { 3876 + const u8 meta_len = skb_metadata_len(skb); 3878 3877 u32 max_len = BPF_SKB_MAX_LEN; 3879 3878 u32 new_len = skb->len + 
head_room; 3880 3879 int ret; ··· 3885 3882 new_len < skb->len)) 3886 3883 return -EINVAL; 3887 3884 3888 - ret = skb_cow(skb, head_room); 3885 + ret = skb_cow(skb, meta_len + head_room); 3889 3886 if (likely(!ret)) { 3890 3887 /* Idea for this helper is that we currently only 3891 3888 * allow to expand on mac header. This means that ··· 3897 3894 * for redirection into L2 device. 3898 3895 */ 3899 3896 __skb_push(skb, head_room); 3897 + skb_postpush_data_move(skb, head_room, 0); 3900 3898 memset(skb->data, 0, head_room); 3901 3899 skb_reset_mac_header(skb); 3902 3900 skb_reset_mac_len(skb); ··· 12106 12102 return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; 12107 12103 } 12108 12104 12105 + int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, 12106 + const void *from, u32 len, u64 flags) 12107 + { 12108 + if (unlikely(flags)) 12109 + return -EINVAL; 12110 + if (unlikely(bpf_try_make_writable(skb, 0))) 12111 + return -EFAULT; 12112 + 12113 + memmove(bpf_skb_meta_pointer(skb, offset), from, len); 12114 + return 0; 12115 + } 12116 + 12109 12117 __bpf_kfunc_start_defs(); 12110 12118 __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, 12111 12119 struct bpf_dynptr *ptr__uninit) ··· 12145 12129 * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to 12146 12130 * &__sk_buff->data_meta. 12147 12131 * 12148 - * If passed @skb_ is a clone which shares the data with the original, the 12149 - * dynptr will be read-only. This limitation may be lifted in the future. 12150 - * 12151 12132 * Return: 12152 12133 * * %0 - dynptr ready to use 12153 12134 * * %-EINVAL - invalid flags, dynptr set to null ··· 12161 12148 } 12162 12149 12163 12150 bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); 12164 - 12165 - if (skb_cloned(skb)) 12166 - bpf_dynptr_set_rdonly(ptr); 12167 12151 12168 12152 return 0; 12169 12153 }
+4 -2
net/core/skbuff.c
··· 2234 2234 * 2235 2235 * All the pointers pointing into skb header may change and must be 2236 2236 * reloaded after call to this function. 2237 + * 2238 + * Note: If you skb_push() the start of the buffer after reallocating the 2239 + * header, call skb_postpush_data_move() first to move the metadata out of 2240 + * the way before writing to &sk_buff->data. 2237 2241 */ 2238 2242 2239 2243 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, ··· 2308 2304 skb->hdr_len = 0; 2309 2305 skb->nohdr = 0; 2310 2306 atomic_set(&skb_shinfo(skb)->dataref, 1); 2311 - 2312 - skb_metadata_clear(skb); 2313 2307 2314 2308 /* It is not generally safe to change skb->truesize. 2315 2309 * For the moment, we really care of rx path, or
+17 -14
net/ipv4/tcp_output.c
··· 40 40 #include <net/tcp.h> 41 41 #include <net/tcp_ecn.h> 42 42 #include <net/mptcp.h> 43 + #include <net/smc.h> 43 44 #include <net/proto_memory.h> 44 45 #include <net/psp.h> 45 46 ··· 803 802 mptcp_options_write(th, ptr, tp, opts); 804 803 } 805 804 806 - static void smc_set_option(const struct tcp_sock *tp, 805 + static void smc_set_option(struct tcp_sock *tp, 807 806 struct tcp_out_options *opts, 808 807 unsigned int *remaining) 809 808 { 810 809 #if IS_ENABLED(CONFIG_SMC) 811 - if (static_branch_unlikely(&tcp_have_smc)) { 812 - if (tp->syn_smc) { 813 - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 814 - opts->options |= OPTION_SMC; 815 - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 816 - } 810 + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) { 811 + tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option); 812 + /* re-check syn_smc */ 813 + if (tp->syn_smc && 814 + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 815 + opts->options |= OPTION_SMC; 816 + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 817 817 } 818 818 } 819 819 #endif 820 820 } 821 821 822 822 static void smc_set_option_cond(const struct tcp_sock *tp, 823 - const struct inet_request_sock *ireq, 823 + struct inet_request_sock *ireq, 824 824 struct tcp_out_options *opts, 825 825 unsigned int *remaining) 826 826 { 827 827 #if IS_ENABLED(CONFIG_SMC) 828 - if (static_branch_unlikely(&tcp_have_smc)) { 829 - if (tp->syn_smc && ireq->smc_ok) { 830 - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 831 - opts->options |= OPTION_SMC; 832 - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 833 - } 828 + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) { 829 + ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq); 830 + /* re-check smc_ok */ 831 + if (ireq->smc_ok && 832 + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 833 + opts->options |= OPTION_SMC; 834 + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 834 835 } 835 836 } 836 837 #endif
+10
net/smc/Kconfig
··· 19 19 smcss. 20 20 21 21 if unsure, say Y. 22 + 23 + config SMC_HS_CTRL_BPF 24 + bool "Generic eBPF hook for SMC handshake flow" 25 + depends on SMC && BPF_SYSCALL 26 + default y 27 + help 28 + SMC_HS_CTRL_BPF enables support for registering a generic eBPF hook for the SMC 29 + handshake flow, which offers much greater flexibility in modifying the behavior 30 + of the SMC protocol stack compared to a complete kernel-based approach. Select 31 + this option if you want to filter the handshake process via eBPF programs.
+1
net/smc/Makefile
··· 6 6 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o 7 7 smc-y += smc_tracepoint.o smc_inet.o 8 8 smc-$(CONFIG_SYSCTL) += smc_sysctl.o 9 + smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o
+9
net/smc/af_smc.c
··· 58 58 #include "smc_tracepoint.h" 59 59 #include "smc_sysctl.h" 60 60 #include "smc_inet.h" 61 + #include "smc_hs_bpf.h" 61 62 62 63 static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group 63 64 * creation on server ··· 3601 3600 pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); 3602 3601 goto out_ulp; 3603 3602 } 3603 + rc = bpf_smc_hs_ctrl_init(); 3604 + if (rc) { 3605 + pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__, 3606 + rc); 3607 + goto out_inet; 3608 + } 3604 3609 static_branch_enable(&tcp_have_smc); 3605 3610 return 0; 3611 + out_inet: 3612 + smc_inet_exit(); 3606 3613 out_ulp: 3607 3614 tcp_unregister_ulp(&smc_ulp_ops); 3608 3615 out_ib:
+140
net/smc/smc_hs_bpf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 + * 5 + * Generic hook for SMC handshake flow. 6 + * 7 + * Copyright IBM Corp. 2016 8 + * Copyright (c) 2025, Alibaba Inc. 9 + * 10 + * Author: D. Wythe <alibuda@linux.alibaba.com> 11 + */ 12 + 13 + #include <linux/bpf_verifier.h> 14 + #include <linux/bpf.h> 15 + #include <linux/btf.h> 16 + #include <linux/rculist.h> 17 + 18 + #include "smc_hs_bpf.h" 19 + 20 + static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock); 21 + static LIST_HEAD(smc_hs_ctrl_list); 22 + 23 + static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl) 24 + { 25 + int ret = 0; 26 + 27 + spin_lock(&smc_hs_ctrl_list_lock); 28 + /* already exist or duplicate name */ 29 + if (smc_hs_ctrl_find_by_name(ctrl->name)) 30 + ret = -EEXIST; 31 + else 32 + list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list); 33 + spin_unlock(&smc_hs_ctrl_list_lock); 34 + return ret; 35 + } 36 + 37 + static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl) 38 + { 39 + spin_lock(&smc_hs_ctrl_list_lock); 40 + list_del_rcu(&ctrl->list); 41 + spin_unlock(&smc_hs_ctrl_list_lock); 42 + 43 + /* Ensure that all readers to complete */ 44 + synchronize_rcu(); 45 + } 46 + 47 + struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name) 48 + { 49 + struct smc_hs_ctrl *ctrl; 50 + 51 + list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) { 52 + if (strcmp(ctrl->name, name) == 0) 53 + return ctrl; 54 + } 55 + return NULL; 56 + } 57 + 58 + static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; } 59 + static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp, 60 + struct inet_request_sock *ireq) 61 + { 62 + return 1; 63 + } 64 + 65 + static struct smc_hs_ctrl __smc_bpf_hs_ctrl = { 66 + .syn_option = __smc_bpf_stub_set_tcp_option, 67 + .synack_option = __smc_bpf_stub_set_tcp_option_cond, 68 + }; 69 + 70 + static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; } 71 + 72 + static int 
smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link) 73 + { 74 + if (link) 75 + return -EOPNOTSUPP; 76 + 77 + return smc_hs_ctrl_reg(kdata); 78 + } 79 + 80 + static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link) 81 + { 82 + smc_hs_ctrl_unreg(kdata); 83 + } 84 + 85 + static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t, 86 + const struct btf_member *member, 87 + void *kdata, const void *udata) 88 + { 89 + const struct smc_hs_ctrl *u_ctrl; 90 + struct smc_hs_ctrl *k_ctrl; 91 + u32 moff; 92 + 93 + u_ctrl = (const struct smc_hs_ctrl *)udata; 94 + k_ctrl = (struct smc_hs_ctrl *)kdata; 95 + 96 + moff = __btf_member_bit_offset(t, member) / 8; 97 + switch (moff) { 98 + case offsetof(struct smc_hs_ctrl, name): 99 + if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name, 100 + sizeof(u_ctrl->name)) <= 0) 101 + return -EINVAL; 102 + return 1; 103 + case offsetof(struct smc_hs_ctrl, flags): 104 + if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS) 105 + return -EINVAL; 106 + k_ctrl->flags = u_ctrl->flags; 107 + return 1; 108 + default: 109 + break; 110 + } 111 + 112 + return 0; 113 + } 114 + 115 + static const struct bpf_func_proto * 116 + bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 117 + { 118 + return bpf_base_func_proto(func_id, prog); 119 + } 120 + 121 + static const struct bpf_verifier_ops smc_bpf_verifier_ops = { 122 + .get_func_proto = bpf_smc_hs_func_proto, 123 + .is_valid_access = bpf_tracing_btf_ctx_access, 124 + }; 125 + 126 + static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = { 127 + .name = "smc_hs_ctrl", 128 + .init = smc_bpf_hs_ctrl_init, 129 + .reg = smc_bpf_hs_ctrl_reg, 130 + .unreg = smc_bpf_hs_ctrl_unreg, 131 + .cfi_stubs = &__smc_bpf_hs_ctrl, 132 + .verifier_ops = &smc_bpf_verifier_ops, 133 + .init_member = smc_bpf_hs_ctrl_init_member, 134 + .owner = THIS_MODULE, 135 + }; 136 + 137 + int bpf_smc_hs_ctrl_init(void) 138 + { 139 + return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl); 140 + }
+31
net/smc/smc_hs_bpf.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 + * 5 + * Generic hook for SMC handshake flow. 6 + * 7 + * Copyright IBM Corp. 2016 8 + * Copyright (c) 2025, Alibaba Inc. 9 + * 10 + * Author: D. Wythe <alibuda@linux.alibaba.com> 11 + */ 12 + 13 + #ifndef __SMC_HS_CTRL 14 + #define __SMC_HS_CTRL 15 + 16 + #include <net/smc.h> 17 + 18 + /* Find hs_ctrl by the target name, which is required to be a C string. 19 + * Return NULL if no such ctrl was found; otherwise, return a valid ctrl. 20 + * 21 + * Note: Caller MUST ensure this is invoked under rcu_read_lock. 22 + */ 23 + struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name); 24 + 25 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 26 + int bpf_smc_hs_ctrl_init(void); 27 + #else 28 + static inline int bpf_smc_hs_ctrl_init(void) { return 0; } 29 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 30 + 31 + #endif /* __SMC_HS_CTRL */
+91
net/smc/smc_sysctl.c
··· 12 12 13 13 #include <linux/init.h> 14 14 #include <linux/sysctl.h> 15 + #include <linux/bpf.h> 15 16 #include <net/net_namespace.h> 16 17 17 18 #include "smc.h" 18 19 #include "smc_core.h" 19 20 #include "smc_llc.h" 20 21 #include "smc_sysctl.h" 22 + #include "smc_hs_bpf.h" 21 23 22 24 static int min_sndbuf = SMC_BUF_MIN_SIZE; 23 25 static int min_rcvbuf = SMC_BUF_MIN_SIZE; ··· 33 31 static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; 34 32 static unsigned int smcr_max_wr_min = 2; 35 33 static unsigned int smcr_max_wr_max = 2048; 34 + 35 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 36 + static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name) 37 + { 38 + struct smc_hs_ctrl *ctrl = NULL; 39 + 40 + rcu_read_lock(); 41 + /* null or empty name ask to clear current ctrl */ 42 + if (name && name[0]) { 43 + ctrl = smc_hs_ctrl_find_by_name(name); 44 + if (!ctrl) { 45 + rcu_read_unlock(); 46 + return -EINVAL; 47 + } 48 + /* no change, just return */ 49 + if (ctrl == rcu_dereference(net->smc.hs_ctrl)) { 50 + rcu_read_unlock(); 51 + return 0; 52 + } 53 + if (!bpf_try_module_get(ctrl, ctrl->owner)) { 54 + rcu_read_unlock(); 55 + return -EBUSY; 56 + } 57 + } 58 + /* xhcg old ctrl with the new one atomically */ 59 + ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl))); 60 + /* release old ctrl */ 61 + if (ctrl) 62 + bpf_module_put(ctrl, ctrl->owner); 63 + 64 + rcu_read_unlock(); 65 + return 0; 66 + } 67 + 68 + static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write, 69 + void *buffer, size_t *lenp, loff_t *ppos) 70 + { 71 + struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl); 72 + char val[SMC_HS_CTRL_NAME_MAX]; 73 + const struct ctl_table tbl = { 74 + .data = val, 75 + .maxlen = SMC_HS_CTRL_NAME_MAX, 76 + }; 77 + struct smc_hs_ctrl *ctrl; 78 + int ret; 79 + 80 + rcu_read_lock(); 81 + ctrl = rcu_dereference(net->smc.hs_ctrl); 82 + if (ctrl) 83 + memcpy(val, ctrl->name, sizeof(ctrl->name)); 84 + else 85 + val[0] = '\0'; 
86 + rcu_read_unlock(); 87 + 88 + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 89 + if (ret) 90 + return ret; 91 + 92 + if (write) 93 + ret = smc_net_replace_smc_hs_ctrl(net, val); 94 + return ret; 95 + } 96 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 36 97 37 98 static struct ctl_table smc_table[] = { 38 99 { ··· 184 119 .extra1 = &smcr_max_wr_min, 185 120 .extra2 = &smcr_max_wr_max, 186 121 }, 122 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 123 + { 124 + .procname = "hs_ctrl", 125 + .data = &init_net.smc.hs_ctrl, 126 + .mode = 0644, 127 + .maxlen = SMC_HS_CTRL_NAME_MAX, 128 + .proc_handler = proc_smc_hs_ctrl, 129 + }, 130 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 187 131 }; 188 132 189 133 int __net_init smc_sysctl_net_init(struct net *net) ··· 203 129 table = smc_table; 204 130 if (!net_eq(net, &init_net)) { 205 131 int i; 132 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 133 + struct smc_hs_ctrl *ctrl; 134 + 135 + rcu_read_lock(); 136 + ctrl = rcu_dereference(init_net.smc.hs_ctrl); 137 + if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE && 138 + bpf_try_module_get(ctrl, ctrl->owner)) 139 + rcu_assign_pointer(net->smc.hs_ctrl, ctrl); 140 + rcu_read_unlock(); 141 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 206 142 207 143 table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); 208 144 if (!table) ··· 245 161 if (!net_eq(net, &init_net)) 246 162 kfree(table); 247 163 err_alloc: 164 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 165 + smc_net_replace_smc_hs_ctrl(net, NULL); 166 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 248 167 return -ENOMEM; 249 168 } 250 169 ··· 257 170 258 171 table = net->smc.smc_hdr->ctl_table_arg; 259 172 unregister_net_sysctl_table(net->smc.smc_hdr); 173 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 174 + smc_net_replace_smc_hs_ctrl(net, NULL); 175 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 176 + 260 177 if (!net_eq(net, &init_net)) 261 178 kfree(table); 262 179 }
+5
tools/testing/selftests/bpf/config
··· 123 123 CONFIG_XFRM_INTERFACE=y 124 124 CONFIG_TCP_CONG_DCTCP=y 125 125 CONFIG_TCP_CONG_BBR=y 126 + CONFIG_INFINIBAND=y 127 + CONFIG_SMC=y 128 + CONFIG_SMC_HS_CTRL_BPF=y 129 + CONFIG_DIBS=y 130 + CONFIG_DIBS_LO=y
+390
tools/testing/selftests/bpf/prog_tests/test_bpf_smc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #include <test_progs.h> 3 + #include <linux/genetlink.h> 4 + #include "network_helpers.h" 5 + #include "bpf_smc.skel.h" 6 + 7 + #ifndef IPPROTO_SMC 8 + #define IPPROTO_SMC 256 9 + #endif 10 + 11 + #define CLIENT_IP "127.0.0.1" 12 + #define SERVER_IP "127.0.1.0" 13 + #define SERVER_IP_VIA_RISK_PATH "127.0.2.0" 14 + 15 + #define SERVICE_1 80 16 + #define SERVICE_2 443 17 + #define SERVICE_3 8443 18 + 19 + #define TEST_NS "bpf_smc_netns" 20 + 21 + static struct netns_obj *test_netns; 22 + 23 + struct smc_policy_ip_key { 24 + __u32 sip; 25 + __u32 dip; 26 + }; 27 + 28 + struct smc_policy_ip_value { 29 + __u8 mode; 30 + }; 31 + 32 + #if defined(__s390x__) 33 + /* s390x has default seid */ 34 + static bool setup_ueid(void) { return true; } 35 + static void cleanup_ueid(void) {} 36 + #else 37 + enum { 38 + SMC_NETLINK_ADD_UEID = 10, 39 + SMC_NETLINK_REMOVE_UEID 40 + }; 41 + 42 + enum { 43 + SMC_NLA_EID_TABLE_UNSPEC, 44 + SMC_NLA_EID_TABLE_ENTRY, /* string */ 45 + }; 46 + 47 + struct msgtemplate { 48 + struct nlmsghdr n; 49 + struct genlmsghdr g; 50 + char buf[1024]; 51 + }; 52 + 53 + #define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) 54 + #define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) 55 + #define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) 56 + #define NLA_PAYLOAD(len) ((len) - NLA_HDRLEN) 57 + 58 + #define SMC_GENL_FAMILY_NAME "SMC_GEN_NETLINK" 59 + #define SMC_BPFTEST_UEID "SMC-BPFTEST-UEID" 60 + 61 + static uint16_t smc_nl_family_id = -1; 62 + 63 + static int send_cmd(int fd, __u16 nlmsg_type, __u32 nlmsg_pid, 64 + __u16 nlmsg_flags, __u8 genl_cmd, __u16 nla_type, 65 + void *nla_data, int nla_len) 66 + { 67 + struct nlattr *na; 68 + struct sockaddr_nl nladdr; 69 + int r, buflen; 70 + char *buf; 71 + 72 + struct msgtemplate msg = {0}; 73 + 74 + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); 75 + msg.n.nlmsg_type = nlmsg_type; 76 + msg.n.nlmsg_flags = nlmsg_flags; 77 + 
msg.n.nlmsg_seq = 0; 78 + msg.n.nlmsg_pid = nlmsg_pid; 79 + msg.g.cmd = genl_cmd; 80 + msg.g.version = 1; 81 + na = (struct nlattr *)GENLMSG_DATA(&msg); 82 + na->nla_type = nla_type; 83 + na->nla_len = nla_len + 1 + NLA_HDRLEN; 84 + memcpy(NLA_DATA(na), nla_data, nla_len); 85 + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); 86 + 87 + buf = (char *)&msg; 88 + buflen = msg.n.nlmsg_len; 89 + memset(&nladdr, 0, sizeof(nladdr)); 90 + nladdr.nl_family = AF_NETLINK; 91 + 92 + while ((r = sendto(fd, buf, buflen, 0, (struct sockaddr *)&nladdr, 93 + sizeof(nladdr))) < buflen) { 94 + if (r > 0) { 95 + buf += r; 96 + buflen -= r; 97 + } else if (errno != EAGAIN) { 98 + return -1; 99 + } 100 + } 101 + return 0; 102 + } 103 + 104 + static bool get_smc_nl_family_id(void) 105 + { 106 + struct sockaddr_nl nl_src; 107 + struct msgtemplate msg; 108 + struct nlattr *nl; 109 + int fd, ret; 110 + pid_t pid; 111 + 112 + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); 113 + if (!ASSERT_OK_FD(fd, "nl_family socket")) 114 + return false; 115 + 116 + pid = getpid(); 117 + 118 + memset(&nl_src, 0, sizeof(nl_src)); 119 + nl_src.nl_family = AF_NETLINK; 120 + nl_src.nl_pid = pid; 121 + 122 + ret = bind(fd, (struct sockaddr *)&nl_src, sizeof(nl_src)); 123 + if (!ASSERT_OK(ret, "nl_family bind")) 124 + goto fail; 125 + 126 + ret = send_cmd(fd, GENL_ID_CTRL, pid, 127 + NLM_F_REQUEST, CTRL_CMD_GETFAMILY, 128 + CTRL_ATTR_FAMILY_NAME, (void *)SMC_GENL_FAMILY_NAME, 129 + strlen(SMC_GENL_FAMILY_NAME)); 130 + if (!ASSERT_OK(ret, "nl_family query")) 131 + goto fail; 132 + 133 + ret = recv(fd, &msg, sizeof(msg), 0); 134 + if (!ASSERT_FALSE(msg.n.nlmsg_type == NLMSG_ERROR || ret < 0 || 135 + !NLMSG_OK(&msg.n, ret), "nl_family response")) 136 + goto fail; 137 + 138 + nl = (struct nlattr *)GENLMSG_DATA(&msg); 139 + nl = (struct nlattr *)((char *)nl + NLA_ALIGN(nl->nla_len)); 140 + if (!ASSERT_EQ(nl->nla_type, CTRL_ATTR_FAMILY_ID, "nl_family nla type")) 141 + goto fail; 142 + 143 + smc_nl_family_id = 
*(uint16_t *)NLA_DATA(nl); 144 + close(fd); 145 + return true; 146 + fail: 147 + close(fd); 148 + return false; 149 + } 150 + 151 + static bool smc_ueid(int op) 152 + { 153 + struct sockaddr_nl nl_src; 154 + struct msgtemplate msg; 155 + struct nlmsgerr *err; 156 + char test_ueid[32]; 157 + int fd, ret; 158 + pid_t pid; 159 + 160 + /* UEID required */ 161 + memset(test_ueid, '\x20', sizeof(test_ueid)); 162 + memcpy(test_ueid, SMC_BPFTEST_UEID, strlen(SMC_BPFTEST_UEID)); 163 + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); 164 + if (!ASSERT_OK_FD(fd, "ueid socket")) 165 + return false; 166 + 167 + pid = getpid(); 168 + memset(&nl_src, 0, sizeof(nl_src)); 169 + nl_src.nl_family = AF_NETLINK; 170 + nl_src.nl_pid = pid; 171 + 172 + ret = bind(fd, (struct sockaddr *)&nl_src, sizeof(nl_src)); 173 + if (!ASSERT_OK(ret, "ueid bind")) 174 + goto fail; 175 + 176 + ret = send_cmd(fd, smc_nl_family_id, pid, 177 + NLM_F_REQUEST | NLM_F_ACK, op, SMC_NLA_EID_TABLE_ENTRY, 178 + (void *)test_ueid, sizeof(test_ueid)); 179 + if (!ASSERT_OK(ret, "ueid cmd")) 180 + goto fail; 181 + 182 + ret = recv(fd, &msg, sizeof(msg), 0); 183 + if (!ASSERT_FALSE(ret < 0 || 184 + !NLMSG_OK(&msg.n, ret), "ueid response")) 185 + goto fail; 186 + 187 + if (msg.n.nlmsg_type == NLMSG_ERROR) { 188 + err = NLMSG_DATA(&msg); 189 + switch (op) { 190 + case SMC_NETLINK_REMOVE_UEID: 191 + if (!ASSERT_FALSE((err->error && err->error != -ENOENT), 192 + "ueid remove")) 193 + goto fail; 194 + break; 195 + case SMC_NETLINK_ADD_UEID: 196 + if (!ASSERT_OK(err->error, "ueid add")) 197 + goto fail; 198 + break; 199 + default: 200 + break; 201 + } 202 + } 203 + close(fd); 204 + return true; 205 + fail: 206 + close(fd); 207 + return false; 208 + } 209 + 210 + static bool setup_ueid(void) 211 + { 212 + /* get smc nl id */ 213 + if (!get_smc_nl_family_id()) 214 + return false; 215 + /* clear old ueid for bpftest */ 216 + smc_ueid(SMC_NETLINK_REMOVE_UEID); 217 + /* smc-loopback required ueid */ 218 + return 
smc_ueid(SMC_NETLINK_ADD_UEID); 219 + } 220 + 221 + static void cleanup_ueid(void) 222 + { 223 + smc_ueid(SMC_NETLINK_REMOVE_UEID); 224 + } 225 + #endif /* __s390x__ */ 226 + 227 + static bool setup_netns(void) 228 + { 229 + test_netns = netns_new(TEST_NS, true); 230 + if (!ASSERT_OK_PTR(test_netns, "open net namespace")) 231 + goto fail_netns; 232 + 233 + SYS(fail_ip, "ip addr add 127.0.1.0/8 dev lo"); 234 + SYS(fail_ip, "ip addr add 127.0.2.0/8 dev lo"); 235 + 236 + return true; 237 + fail_ip: 238 + netns_free(test_netns); 239 + fail_netns: 240 + return false; 241 + } 242 + 243 + static void cleanup_netns(void) 244 + { 245 + netns_free(test_netns); 246 + } 247 + 248 + static bool setup_smc(void) 249 + { 250 + if (!setup_ueid()) 251 + return false; 252 + 253 + if (!setup_netns()) 254 + goto fail_netns; 255 + 256 + return true; 257 + fail_netns: 258 + cleanup_ueid(); 259 + return false; 260 + } 261 + 262 + static int set_client_addr_cb(int fd, void *opts) 263 + { 264 + const char *src = (const char *)opts; 265 + struct sockaddr_in localaddr; 266 + 267 + localaddr.sin_family = AF_INET; 268 + localaddr.sin_port = htons(0); 269 + localaddr.sin_addr.s_addr = inet_addr(src); 270 + return !ASSERT_OK(bind(fd, &localaddr, sizeof(localaddr)), "client bind"); 271 + } 272 + 273 + static void run_link(const char *src, const char *dst, int port) 274 + { 275 + struct network_helper_opts opts = {0}; 276 + int server, client; 277 + 278 + server = start_server_str(AF_INET, SOCK_STREAM, dst, port, NULL); 279 + if (!ASSERT_OK_FD(server, "start service_1")) 280 + return; 281 + 282 + opts.proto = IPPROTO_TCP; 283 + opts.post_socket_cb = set_client_addr_cb; 284 + opts.cb_opts = (void *)src; 285 + 286 + client = connect_to_fd_opts(server, &opts); 287 + if (!ASSERT_OK_FD(client, "start connect")) 288 + goto fail_client; 289 + 290 + close(client); 291 + fail_client: 292 + close(server); 293 + } 294 + 295 + static void block_link(int map_fd, const char *src, const char *dst) 296 + { 297 + 
struct smc_policy_ip_value val = { .mode = /* block */ 0 }; 298 + struct smc_policy_ip_key key = { 299 + .sip = inet_addr(src), 300 + .dip = inet_addr(dst), 301 + }; 302 + 303 + bpf_map_update_elem(map_fd, &key, &val, BPF_ANY); 304 + } 305 + 306 + /* 307 + * This test describes a real-life service topology as follows: 308 + * 309 + * +-------------> service_1 310 + * link 1 | | 311 + * +--------------------> server | link 2 312 + * | | V 313 + * | +-------------> service_2 314 + * | link 3 315 + * client -------------------> server_via_unsafe_path -> service_3 316 + * 317 + * Among them, 318 + * 1. link-1 is very suitable for using SMC. 319 + * 2. link-2 is not suitable for using SMC, because the mode of this link is 320 + * kind of short-link services. 321 + * 3. link-3 is also not suitable for using SMC, because the RDMA link is 322 + * unavailable and needs to go through a long timeout before it can fallback 323 + * to TCP. 324 + * To achieve this goal, we use a customized SMC ip strategy via smc_hs_ctrl. 325 + */ 326 + static void test_topo(void) 327 + { 328 + struct bpf_smc *skel; 329 + int rc, map_fd; 330 + 331 + skel = bpf_smc__open_and_load(); 332 + if (!ASSERT_OK_PTR(skel, "bpf_smc__open_and_load")) 333 + return; 334 + 335 + rc = bpf_smc__attach(skel); 336 + if (!ASSERT_OK(rc, "bpf_smc__attach")) 337 + goto fail; 338 + 339 + map_fd = bpf_map__fd(skel->maps.smc_policy_ip); 340 + if (!ASSERT_OK_FD(map_fd, "bpf_map__fd")) 341 + goto fail; 342 + 343 + /* Mock the process of transparent replacement, since we will modify 344 + * protocol to ipproto_smc accropding to it via 345 + * fmod_ret/update_socket_protocol. 
346 + */ 347 + write_sysctl("/proc/sys/net/smc/hs_ctrl", "linkcheck"); 348 + 349 + /* Configure ip strat */ 350 + block_link(map_fd, CLIENT_IP, SERVER_IP_VIA_RISK_PATH); 351 + block_link(map_fd, SERVER_IP, SERVER_IP); 352 + 353 + /* should go with smc */ 354 + run_link(CLIENT_IP, SERVER_IP, SERVICE_1); 355 + /* should go with smc fallback */ 356 + run_link(SERVER_IP, SERVER_IP, SERVICE_2); 357 + 358 + ASSERT_EQ(skel->bss->smc_cnt, 2, "smc count"); 359 + ASSERT_EQ(skel->bss->fallback_cnt, 1, "fallback count"); 360 + 361 + /* should go with smc */ 362 + run_link(CLIENT_IP, SERVER_IP, SERVICE_2); 363 + 364 + ASSERT_EQ(skel->bss->smc_cnt, 3, "smc count"); 365 + ASSERT_EQ(skel->bss->fallback_cnt, 1, "fallback count"); 366 + 367 + /* should go with smc fallback */ 368 + run_link(CLIENT_IP, SERVER_IP_VIA_RISK_PATH, SERVICE_3); 369 + 370 + ASSERT_EQ(skel->bss->smc_cnt, 4, "smc count"); 371 + ASSERT_EQ(skel->bss->fallback_cnt, 2, "fallback count"); 372 + 373 + fail: 374 + bpf_smc__destroy(skel); 375 + } 376 + 377 + void test_bpf_smc(void) 378 + { 379 + if (!setup_smc()) { 380 + printf("setup for smc test failed, test SKIP:\n"); 381 + test__skip(); 382 + return; 383 + } 384 + 385 + if (test__start_subtest("topo")) 386 + test_topo(); 387 + 388 + cleanup_ueid(); 389 + cleanup_netns(); 390 + }
+75 -54
tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c
··· 124 124 int n, sock = -1; 125 125 __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; 126 126 127 - /* The ethernet header is not relevant for this test and doesn't need to 128 - * be meaningful. 129 - */ 130 - struct ethhdr eth = { 0 }; 127 + /* We use the Ethernet header only to identify the test packet */ 128 + struct ethhdr eth = { 129 + .h_source = { 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF }, 130 + }; 131 131 132 132 memcpy(packet, &eth, sizeof(eth)); 133 133 memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN); ··· 160 160 __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; 161 161 int n; 162 162 163 - /* The ethernet header doesn't need to be valid for this test */ 164 - memset(packet, 0, sizeof(struct ethhdr)); 163 + /* The Ethernet header is mostly not relevant. We use it to identify the 164 + * test packet and some BPF helpers we exercise expect to operate on 165 + * Ethernet frames carrying IP packets. Pretend that's the case. 166 + */ 167 + struct ethhdr eth = { 168 + .h_source = { 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF }, 169 + .h_proto = htons(ETH_P_IP), 170 + }; 171 + 172 + memcpy(packet, &eth, sizeof(eth)); 165 173 memcpy(packet + sizeof(struct ethhdr), test_payload, TEST_PAYLOAD_LEN); 166 174 167 175 n = write(tap_fd, packet, sizeof(packet)); ··· 179 171 return 0; 180 172 } 181 173 182 - static void assert_test_result(const struct bpf_map *result_map) 174 + static void dump_err_stream(const struct bpf_program *prog) 183 175 { 184 - int err; 185 - __u32 map_key = 0; 186 - __u8 map_value[TEST_PAYLOAD_LEN]; 176 + char buf[512]; 177 + int ret; 187 178 188 - err = bpf_map__lookup_elem(result_map, &map_key, sizeof(map_key), 189 - &map_value, TEST_PAYLOAD_LEN, BPF_ANY); 190 - if (!ASSERT_OK(err, "lookup test_result")) 191 - return; 192 - 193 - ASSERT_MEMEQ(&map_value, &test_payload, TEST_PAYLOAD_LEN, 194 - "test_result map contains test payload"); 195 - } 196 - 197 - static bool clear_test_result(struct bpf_map *result_map) 198 - { 199 - const 
__u8 v[sizeof(test_payload)] = {}; 200 - const __u32 k = 0; 201 - int err; 202 - 203 - err = bpf_map__update_elem(result_map, &k, sizeof(k), v, sizeof(v), BPF_ANY); 204 - ASSERT_OK(err, "update test_result"); 205 - 206 - return err == 0; 179 + ret = 0; 180 + do { 181 + ret = bpf_prog_stream_read(bpf_program__fd(prog), 182 + BPF_STREAM_STDERR, buf, sizeof(buf), 183 + NULL); 184 + if (ret > 0) 185 + fwrite(buf, sizeof(buf[0]), ret, stderr); 186 + } while (ret > 0); 207 187 } 208 188 209 189 void test_xdp_context_veth(void) ··· 266 270 if (!ASSERT_GE(tx_ifindex, 0, "if_nametoindex tx")) 267 271 goto close; 268 272 273 + skel->bss->test_pass = false; 274 + 269 275 ret = send_test_packet(tx_ifindex); 270 276 if (!ASSERT_OK(ret, "send_test_packet")) 271 277 goto close; 272 278 273 - assert_test_result(skel->maps.test_result); 279 + if (!ASSERT_TRUE(skel->bss->test_pass, "test_pass")) 280 + dump_err_stream(tc_prog); 274 281 275 282 close: 276 283 close_netns(nstoken); ··· 285 286 static void test_tuntap(struct bpf_program *xdp_prog, 286 287 struct bpf_program *tc_prio_1_prog, 287 288 struct bpf_program *tc_prio_2_prog, 288 - struct bpf_map *result_map) 289 + bool *test_pass) 289 290 { 290 291 LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); 291 292 LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); ··· 294 295 int tap_ifindex; 295 296 int ret; 296 297 297 - if (!clear_test_result(result_map)) 298 - return; 298 + *test_pass = false; 299 299 300 300 ns = netns_new(TAP_NETNS, true); 301 301 if (!ASSERT_OK_PTR(ns, "create and open ns")) ··· 338 340 if (!ASSERT_OK(ret, "write_test_packet")) 339 341 goto close; 340 342 341 - assert_test_result(result_map); 343 + if (!ASSERT_TRUE(*test_pass, "test_pass")) 344 + dump_err_stream(tc_prio_2_prog ? 
: tc_prio_1_prog); 342 345 343 346 close: 344 347 if (tap_fd >= 0) ··· 410 411 if (!ASSERT_OK(ret, "write_test_packet")) 411 412 goto close; 412 413 413 - ASSERT_TRUE(*test_pass, "test_pass"); 414 + if (!ASSERT_TRUE(*test_pass, "test_pass")) 415 + dump_err_stream(tc_prog); 414 416 415 417 close: 416 418 if (tap_fd >= 0) ··· 431 431 test_tuntap(skel->progs.ing_xdp, 432 432 skel->progs.ing_cls, 433 433 NULL, /* tc prio 2 */ 434 - skel->maps.test_result); 434 + &skel->bss->test_pass); 435 435 if (test__start_subtest("dynptr_read")) 436 436 test_tuntap(skel->progs.ing_xdp, 437 437 skel->progs.ing_cls_dynptr_read, 438 438 NULL, /* tc prio 2 */ 439 - skel->maps.test_result); 439 + &skel->bss->test_pass); 440 440 if (test__start_subtest("dynptr_slice")) 441 441 test_tuntap(skel->progs.ing_xdp, 442 442 skel->progs.ing_cls_dynptr_slice, 443 443 NULL, /* tc prio 2 */ 444 - skel->maps.test_result); 444 + &skel->bss->test_pass); 445 445 if (test__start_subtest("dynptr_write")) 446 446 test_tuntap(skel->progs.ing_xdp_zalloc_meta, 447 447 skel->progs.ing_cls_dynptr_write, 448 448 skel->progs.ing_cls_dynptr_read, 449 - skel->maps.test_result); 449 + &skel->bss->test_pass); 450 450 if (test__start_subtest("dynptr_slice_rdwr")) 451 451 test_tuntap(skel->progs.ing_xdp_zalloc_meta, 452 452 skel->progs.ing_cls_dynptr_slice_rdwr, 453 453 skel->progs.ing_cls_dynptr_slice, 454 - skel->maps.test_result); 454 + &skel->bss->test_pass); 455 455 if (test__start_subtest("dynptr_offset")) 456 456 test_tuntap(skel->progs.ing_xdp_zalloc_meta, 457 457 skel->progs.ing_cls_dynptr_offset_wr, 458 458 skel->progs.ing_cls_dynptr_offset_rd, 459 - skel->maps.test_result); 459 + &skel->bss->test_pass); 460 460 if (test__start_subtest("dynptr_offset_oob")) 461 461 test_tuntap(skel->progs.ing_xdp, 462 462 skel->progs.ing_cls_dynptr_offset_oob, 463 463 skel->progs.ing_cls, 464 - skel->maps.test_result); 465 - if (test__start_subtest("clone_data_meta_empty_on_data_write")) 464 + &skel->bss->test_pass); 465 + 
if (test__start_subtest("clone_data_meta_survives_data_write")) 466 466 test_tuntap_mirred(skel->progs.ing_xdp, 467 - skel->progs.clone_data_meta_empty_on_data_write, 467 + skel->progs.clone_data_meta_survives_data_write, 468 468 &skel->bss->test_pass); 469 - if (test__start_subtest("clone_data_meta_empty_on_meta_write")) 469 + if (test__start_subtest("clone_data_meta_survives_meta_write")) 470 470 test_tuntap_mirred(skel->progs.ing_xdp, 471 - skel->progs.clone_data_meta_empty_on_meta_write, 471 + skel->progs.clone_data_meta_survives_meta_write, 472 472 &skel->bss->test_pass); 473 - if (test__start_subtest("clone_dynptr_empty_on_data_slice_write")) 473 + if (test__start_subtest("clone_meta_dynptr_survives_data_slice_write")) 474 474 test_tuntap_mirred(skel->progs.ing_xdp, 475 - skel->progs.clone_dynptr_empty_on_data_slice_write, 475 + skel->progs.clone_meta_dynptr_survives_data_slice_write, 476 476 &skel->bss->test_pass); 477 - if (test__start_subtest("clone_dynptr_empty_on_meta_slice_write")) 477 + if (test__start_subtest("clone_meta_dynptr_survives_meta_slice_write")) 478 478 test_tuntap_mirred(skel->progs.ing_xdp, 479 - skel->progs.clone_dynptr_empty_on_meta_slice_write, 479 + skel->progs.clone_meta_dynptr_survives_meta_slice_write, 480 480 &skel->bss->test_pass); 481 - if (test__start_subtest("clone_dynptr_rdonly_before_data_dynptr_write")) 481 + if (test__start_subtest("clone_meta_dynptr_rw_before_data_dynptr_write")) 482 482 test_tuntap_mirred(skel->progs.ing_xdp, 483 - skel->progs.clone_dynptr_rdonly_before_data_dynptr_write, 483 + skel->progs.clone_meta_dynptr_rw_before_data_dynptr_write, 484 484 &skel->bss->test_pass); 485 - if (test__start_subtest("clone_dynptr_rdonly_before_meta_dynptr_write")) 485 + if (test__start_subtest("clone_meta_dynptr_rw_before_meta_dynptr_write")) 486 486 test_tuntap_mirred(skel->progs.ing_xdp, 487 - skel->progs.clone_dynptr_rdonly_before_meta_dynptr_write, 487 + skel->progs.clone_meta_dynptr_rw_before_meta_dynptr_write, 488 488 
&skel->bss->test_pass); 489 + /* Tests for BPF helpers which touch headroom */ 490 + if (test__start_subtest("helper_skb_vlan_push_pop")) 491 + test_tuntap(skel->progs.ing_xdp, 492 + skel->progs.helper_skb_vlan_push_pop, 493 + NULL, /* tc prio 2 */ 494 + &skel->bss->test_pass); 495 + if (test__start_subtest("helper_skb_adjust_room")) 496 + test_tuntap(skel->progs.ing_xdp, 497 + skel->progs.helper_skb_adjust_room, 498 + NULL, /* tc prio 2 */ 499 + &skel->bss->test_pass); 500 + if (test__start_subtest("helper_skb_change_head_tail")) 501 + test_tuntap(skel->progs.ing_xdp, 502 + skel->progs.helper_skb_change_head_tail, 503 + NULL, /* tc prio 2 */ 504 + &skel->bss->test_pass); 505 + if (test__start_subtest("helper_skb_change_proto")) 506 + test_tuntap(skel->progs.ing_xdp, 507 + skel->progs.helper_skb_change_proto, 508 + NULL, /* tc prio 2 */ 509 + &skel->bss->test_pass); 489 510 490 511 test_xdp_meta__destroy(skel); 491 512 }
+117
tools/testing/selftests/bpf/progs/bpf_smc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "vmlinux.h" 4 + 5 + #include <bpf/bpf_helpers.h> 6 + #include <bpf/bpf_tracing.h> 7 + #include "bpf_tracing_net.h" 8 + 9 + char _license[] SEC("license") = "GPL"; 10 + 11 + enum { 12 + BPF_SMC_LISTEN = 10, 13 + }; 14 + 15 + struct smc_sock___local { 16 + struct sock sk; 17 + struct smc_sock *listen_smc; 18 + bool use_fallback; 19 + } __attribute__((preserve_access_index)); 20 + 21 + int smc_cnt = 0; 22 + int fallback_cnt = 0; 23 + 24 + SEC("fentry/smc_release") 25 + int BPF_PROG(bpf_smc_release, struct socket *sock) 26 + { 27 + /* only count from one side (client) */ 28 + if (sock->sk->__sk_common.skc_state == BPF_SMC_LISTEN) 29 + return 0; 30 + smc_cnt++; 31 + return 0; 32 + } 33 + 34 + SEC("fentry/smc_switch_to_fallback") 35 + int BPF_PROG(bpf_smc_switch_to_fallback, struct smc_sock___local *smc) 36 + { 37 + /* only count from one side (client) */ 38 + if (smc && !smc->listen_smc) 39 + fallback_cnt++; 40 + return 0; 41 + } 42 + 43 + /* go with default value if no strat was found */ 44 + bool default_ip_strat_value = true; 45 + 46 + struct smc_policy_ip_key { 47 + __u32 sip; 48 + __u32 dip; 49 + }; 50 + 51 + struct smc_policy_ip_value { 52 + __u8 mode; 53 + }; 54 + 55 + struct { 56 + __uint(type, BPF_MAP_TYPE_HASH); 57 + __uint(key_size, sizeof(struct smc_policy_ip_key)); 58 + __uint(value_size, sizeof(struct smc_policy_ip_value)); 59 + __uint(max_entries, 128); 60 + __uint(map_flags, BPF_F_NO_PREALLOC); 61 + } smc_policy_ip SEC(".maps"); 62 + 63 + static bool smc_check(__u32 src, __u32 dst) 64 + { 65 + struct smc_policy_ip_value *value; 66 + struct smc_policy_ip_key key = { 67 + .sip = src, 68 + .dip = dst, 69 + }; 70 + 71 + value = bpf_map_lookup_elem(&smc_policy_ip, &key); 72 + return value ? 
value->mode : default_ip_strat_value; 73 + } 74 + 75 + SEC("fmod_ret/update_socket_protocol") 76 + int BPF_PROG(smc_run, int family, int type, int protocol) 77 + { 78 + struct task_struct *task; 79 + 80 + if (family != AF_INET && family != AF_INET6) 81 + return protocol; 82 + 83 + if ((type & 0xf) != SOCK_STREAM) 84 + return protocol; 85 + 86 + if (protocol != 0 && protocol != IPPROTO_TCP) 87 + return protocol; 88 + 89 + task = bpf_get_current_task_btf(); 90 + /* Prevent from affecting other tests */ 91 + if (!task || !task->nsproxy->net_ns->smc.hs_ctrl) 92 + return protocol; 93 + 94 + return IPPROTO_SMC; 95 + } 96 + 97 + SEC("struct_ops") 98 + int BPF_PROG(bpf_smc_set_tcp_option_cond, const struct tcp_sock *tp, 99 + struct inet_request_sock *ireq) 100 + { 101 + return smc_check(ireq->req.__req_common.skc_daddr, 102 + ireq->req.__req_common.skc_rcv_saddr); 103 + } 104 + 105 + SEC("struct_ops") 106 + int BPF_PROG(bpf_smc_set_tcp_option, struct tcp_sock *tp) 107 + { 108 + return smc_check(tp->inet_conn.icsk_inet.sk.__sk_common.skc_rcv_saddr, 109 + tp->inet_conn.icsk_inet.sk.__sk_common.skc_daddr); 110 + } 111 + 112 + SEC(".struct_ops") 113 + struct smc_hs_ctrl linkcheck = { 114 + .name = "linkcheck", 115 + .syn_option = (void *)bpf_smc_set_tcp_option, 116 + .synack_option = (void *)bpf_smc_set_tcp_option_cond, 117 + };
+282 -104
tools/testing/selftests/bpf/progs/test_xdp_meta.c
··· 4 4 #include <linux/if_ether.h> 5 5 #include <linux/pkt_cls.h> 6 6 7 + #include <bpf/bpf_endian.h> 7 8 #include <bpf/bpf_helpers.h> 8 9 #include "bpf_kfuncs.h" 9 10 ··· 12 11 13 12 #define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem 14 13 15 - /* Demonstrates how metadata can be passed from an XDP program to a TC program 16 - * using bpf_xdp_adjust_meta. 17 - * For the sake of testing the metadata support in drivers, the XDP program uses 18 - * a fixed-size payload after the Ethernet header as metadata. The TC program 19 - * copies the metadata it receives into a map so it can be checked from 20 - * userspace. 14 + /* Demonstrate passing metadata from XDP to TC using bpf_xdp_adjust_meta. 15 + * 16 + * The XDP program extracts a fixed-size payload following the Ethernet header 17 + * and stores it as packet metadata to test the driver's metadata support. The 18 + * TC program then verifies if the passed metadata is correct. 21 19 */ 22 20 23 - struct { 24 - __uint(type, BPF_MAP_TYPE_ARRAY); 25 - __uint(max_entries, 1); 26 - __type(key, __u32); 27 - __uint(value_size, META_SIZE); 28 - } test_result SEC(".maps"); 29 - 30 21 bool test_pass; 22 + 23 + static const __u8 smac_want[ETH_ALEN] = { 24 + 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF, 25 + }; 26 + 27 + static const __u8 meta_want[META_SIZE] = { 28 + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 29 + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 30 + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 31 + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 32 + }; 33 + 34 + static bool check_smac(const struct ethhdr *eth) 35 + { 36 + return !__builtin_memcmp(eth->h_source, smac_want, ETH_ALEN); 37 + } 38 + 39 + static bool check_metadata(const char *file, int line, __u8 *meta_have) 40 + { 41 + if (!__builtin_memcmp(meta_have, meta_want, META_SIZE)) 42 + return true; 43 + 44 + bpf_stream_printk(BPF_STREAM_STDERR, 45 + "FAIL:%s:%d: metadata mismatch\n" 46 + " have:\n %pI6\n %pI6\n" 47 + " want:\n %pI6\n %pI6\n", 48 + 
file, line, 49 + &meta_have[0x00], &meta_have[0x10], 50 + &meta_want[0x00], &meta_want[0x10]); 51 + return false; 52 + } 53 + 54 + #define check_metadata(meta_have) check_metadata(__FILE__, __LINE__, meta_have) 55 + 56 + static bool check_skb_metadata(const char *file, int line, struct __sk_buff *skb) 57 + { 58 + __u8 *data_meta = ctx_ptr(skb, data_meta); 59 + __u8 *data = ctx_ptr(skb, data); 60 + 61 + return data_meta + META_SIZE <= data && (check_metadata)(file, line, data_meta); 62 + } 63 + 64 + #define check_skb_metadata(skb) check_skb_metadata(__FILE__, __LINE__, skb) 31 65 32 66 SEC("tc") 33 67 int ing_cls(struct __sk_buff *ctx) 34 68 { 35 - __u8 *data, *data_meta; 36 - __u32 key = 0; 69 + __u8 *meta_have = ctx_ptr(ctx, data_meta); 70 + __u8 *data = ctx_ptr(ctx, data); 37 71 38 - data_meta = ctx_ptr(ctx, data_meta); 39 - data = ctx_ptr(ctx, data); 72 + if (meta_have + META_SIZE > data) 73 + goto out; 40 74 41 - if (data_meta + META_SIZE > data) 42 - return TC_ACT_SHOT; 75 + if (!check_metadata(meta_have)) 76 + goto out; 43 77 44 - bpf_map_update_elem(&test_result, &key, data_meta, BPF_ANY); 45 - 78 + test_pass = true; 79 + out: 46 80 return TC_ACT_SHOT; 47 81 } 48 82 ··· 85 49 SEC("tc") 86 50 int ing_cls_dynptr_read(struct __sk_buff *ctx) 87 51 { 52 + __u8 meta_have[META_SIZE]; 88 53 struct bpf_dynptr meta; 89 - const __u32 zero = 0; 90 - __u8 *dst; 91 - 92 - dst = bpf_map_lookup_elem(&test_result, &zero); 93 - if (!dst) 94 - return TC_ACT_SHOT; 95 54 96 55 bpf_dynptr_from_skb_meta(ctx, 0, &meta); 97 - bpf_dynptr_read(dst, META_SIZE, &meta, 0, 0); 56 + bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); 98 57 58 + if (!check_metadata(meta_have)) 59 + goto out; 60 + 61 + test_pass = true; 62 + out: 99 63 return TC_ACT_SHOT; 100 64 } 101 65 ··· 122 86 int ing_cls_dynptr_slice(struct __sk_buff *ctx) 123 87 { 124 88 struct bpf_dynptr meta; 125 - const __u32 zero = 0; 126 - __u8 *dst, *src; 127 - 128 - dst = bpf_map_lookup_elem(&test_result, &zero); 129 - if 
(!dst) 130 - return TC_ACT_SHOT; 89 + __u8 *meta_have; 131 90 132 91 bpf_dynptr_from_skb_meta(ctx, 0, &meta); 133 - src = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE); 134 - if (!src) 135 - return TC_ACT_SHOT; 92 + meta_have = bpf_dynptr_slice(&meta, 0, NULL, META_SIZE); 93 + if (!meta_have) 94 + goto out; 136 95 137 - __builtin_memcpy(dst, src, META_SIZE); 96 + if (!check_metadata(meta_have)) 97 + goto out; 138 98 99 + test_pass = true; 100 + out: 139 101 return TC_ACT_SHOT; 140 102 } 141 103 ··· 163 129 SEC("tc") 164 130 int ing_cls_dynptr_offset_rd(struct __sk_buff *ctx) 165 131 { 166 - struct bpf_dynptr meta; 167 132 const __u32 chunk_len = META_SIZE / 4; 168 - const __u32 zero = 0; 133 + __u8 meta_have[META_SIZE]; 134 + struct bpf_dynptr meta; 169 135 __u8 *dst, *src; 170 136 171 - dst = bpf_map_lookup_elem(&test_result, &zero); 172 - if (!dst) 173 - return TC_ACT_SHOT; 137 + dst = meta_have; 174 138 175 139 /* 1. Regular read */ 176 140 bpf_dynptr_from_skb_meta(ctx, 0, &meta); ··· 187 155 /* 4. Read from a slice starting at an offset */ 188 156 src = bpf_dynptr_slice(&meta, 2 * chunk_len, NULL, chunk_len); 189 157 if (!src) 190 - return TC_ACT_SHOT; 158 + goto out; 191 159 __builtin_memcpy(dst, src, chunk_len); 192 160 161 + if (!check_metadata(meta_have)) 162 + goto out; 163 + 164 + test_pass = true; 165 + out: 193 166 return TC_ACT_SHOT; 194 167 } 195 168 ··· 291 254 /* Drop any non-test packets */ 292 255 if (eth + 1 > ctx_ptr(ctx, data_end)) 293 256 return XDP_DROP; 294 - if (eth->h_proto != 0) 257 + if (!check_smac(eth)) 295 258 return XDP_DROP; 296 259 297 260 ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); ··· 331 294 332 295 /* The Linux networking stack may send other packets on the test 333 296 * interface that interfere with the test. Just drop them. 334 - * The test packets can be recognized by their ethertype of zero. 297 + * The test packets can be recognized by their source MAC address. 
335 298 */ 336 - if (eth->h_proto != 0) 299 + if (!check_smac(eth)) 337 300 return XDP_DROP; 338 301 339 302 __builtin_memcpy(data_meta, payload, META_SIZE); ··· 341 304 } 342 305 343 306 /* 344 - * Check that skb->data_meta..skb->data is empty if prog writes to packet 345 - * _payload_ using packet pointers. Applies only to cloned skbs. 307 + * Check that, when operating on a cloned packet, skb->data_meta..skb->data is 308 + * kept intact if prog writes to packet _payload_ using packet pointers. 346 309 */ 347 310 SEC("tc") 348 - int clone_data_meta_empty_on_data_write(struct __sk_buff *ctx) 311 + int clone_data_meta_survives_data_write(struct __sk_buff *ctx) 349 312 { 313 + __u8 *meta_have = ctx_ptr(ctx, data_meta); 350 314 struct ethhdr *eth = ctx_ptr(ctx, data); 351 315 352 316 if (eth + 1 > ctx_ptr(ctx, data_end)) 353 317 goto out; 354 318 /* Ignore non-test packets */ 355 - if (eth->h_proto != 0) 319 + if (!check_smac(eth)) 356 320 goto out; 357 321 358 - /* Expect no metadata */ 359 - if (ctx->data_meta != ctx->data) 322 + if (meta_have + META_SIZE > eth) 323 + goto out; 324 + 325 + if (!check_metadata(meta_have)) 360 326 goto out; 361 327 362 328 /* Packet write to trigger unclone in prologue */ ··· 371 331 } 372 332 373 333 /* 374 - * Check that skb->data_meta..skb->data is empty if prog writes to packet 375 - * _metadata_ using packet pointers. Applies only to cloned skbs. 334 + * Check that, when operating on a cloned packet, skb->data_meta..skb->data is 335 + * kept intact if prog writes to packet _metadata_ using packet pointers. 
376 336 */ 377 337 SEC("tc") 378 - int clone_data_meta_empty_on_meta_write(struct __sk_buff *ctx) 338 + int clone_data_meta_survives_meta_write(struct __sk_buff *ctx) 379 339 { 340 + __u8 *meta_have = ctx_ptr(ctx, data_meta); 380 341 struct ethhdr *eth = ctx_ptr(ctx, data); 381 - __u8 *md = ctx_ptr(ctx, data_meta); 382 342 383 343 if (eth + 1 > ctx_ptr(ctx, data_end)) 384 344 goto out; 385 345 /* Ignore non-test packets */ 386 - if (eth->h_proto != 0) 346 + if (!check_smac(eth)) 387 347 goto out; 388 348 389 - if (md + 1 > ctx_ptr(ctx, data)) { 390 - /* Expect no metadata */ 391 - test_pass = true; 392 - } else { 393 - /* Metadata write to trigger unclone in prologue */ 394 - *md = 42; 395 - } 349 + if (meta_have + META_SIZE > eth) 350 + goto out; 351 + 352 + if (!check_metadata(meta_have)) 353 + goto out; 354 + 355 + /* Metadata write to trigger unclone in prologue */ 356 + *meta_have = 42; 357 + 358 + test_pass = true; 396 359 out: 397 360 return TC_ACT_SHOT; 398 361 } 399 362 400 363 /* 401 - * Check that skb_meta dynptr is writable but empty if prog writes to packet 402 - * _payload_ using a dynptr slice. Applies only to cloned skbs. 364 + * Check that, when operating on a cloned packet, metadata remains intact if 365 + * prog creates a r/w slice to packet _payload_. 
403 366 */ 404 367 SEC("tc") 405 - int clone_dynptr_empty_on_data_slice_write(struct __sk_buff *ctx) 368 + int clone_meta_dynptr_survives_data_slice_write(struct __sk_buff *ctx) 406 369 { 407 370 struct bpf_dynptr data, meta; 371 + __u8 meta_have[META_SIZE]; 408 372 struct ethhdr *eth; 409 373 410 374 bpf_dynptr_from_skb(ctx, 0, &data); ··· 416 372 if (!eth) 417 373 goto out; 418 374 /* Ignore non-test packets */ 419 - if (eth->h_proto != 0) 375 + if (!check_smac(eth)) 420 376 goto out; 421 377 422 - /* Expect no metadata */ 423 378 bpf_dynptr_from_skb_meta(ctx, 0, &meta); 424 - if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) 379 + bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); 380 + if (!check_metadata(meta_have)) 425 381 goto out; 426 - 427 - /* Packet write to trigger unclone in prologue */ 428 - eth->h_proto = 42; 429 382 430 383 test_pass = true; 431 384 out: ··· 430 389 } 431 390 432 391 /* 433 - * Check that skb_meta dynptr is writable but empty if prog writes to packet 434 - * _metadata_ using a dynptr slice. Applies only to cloned skbs. 392 + * Check that, when operating on a cloned packet, metadata remains intact if 393 + * prog creates an r/w slice to packet _metadata_. 
435 394 */ 436 395 SEC("tc") 437 - int clone_dynptr_empty_on_meta_slice_write(struct __sk_buff *ctx) 396 + int clone_meta_dynptr_survives_meta_slice_write(struct __sk_buff *ctx) 438 397 { 439 398 struct bpf_dynptr data, meta; 440 399 const struct ethhdr *eth; 441 - __u8 *md; 400 + __u8 *meta_have; 442 401 443 402 bpf_dynptr_from_skb(ctx, 0, &data); 444 403 eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); 445 404 if (!eth) 446 405 goto out; 447 406 /* Ignore non-test packets */ 448 - if (eth->h_proto != 0) 407 + if (!check_smac(eth)) 449 408 goto out; 450 409 451 - /* Expect no metadata */ 452 410 bpf_dynptr_from_skb_meta(ctx, 0, &meta); 453 - if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) > 0) 411 + meta_have = bpf_dynptr_slice_rdwr(&meta, 0, NULL, META_SIZE); 412 + if (!meta_have) 454 413 goto out; 455 414 456 - /* Metadata write to trigger unclone in prologue */ 457 - bpf_dynptr_from_skb_meta(ctx, 0, &meta); 458 - md = bpf_dynptr_slice_rdwr(&meta, 0, NULL, sizeof(*md)); 459 - if (md) 460 - *md = 42; 415 + if (!check_metadata(meta_have)) 416 + goto out; 461 417 462 418 test_pass = true; 463 419 out: ··· 462 424 } 463 425 464 426 /* 465 - * Check that skb_meta dynptr is read-only before prog writes to packet payload 466 - * using dynptr_write helper. Applies only to cloned skbs. 427 + * Check that, when operating on a cloned packet, skb_meta dynptr is read-write 428 + * before prog writes to packet _payload_ using dynptr_write helper and metadata 429 + * remains intact before and after the write. 
467 430 */ 468 431 SEC("tc") 469 - int clone_dynptr_rdonly_before_data_dynptr_write(struct __sk_buff *ctx) 432 + int clone_meta_dynptr_rw_before_data_dynptr_write(struct __sk_buff *ctx) 470 433 { 471 434 struct bpf_dynptr data, meta; 435 + __u8 meta_have[META_SIZE]; 472 436 const struct ethhdr *eth; 437 + int err; 473 438 474 439 bpf_dynptr_from_skb(ctx, 0, &data); 475 440 eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); 476 441 if (!eth) 477 442 goto out; 478 443 /* Ignore non-test packets */ 479 - if (eth->h_proto != 0) 444 + if (!check_smac(eth)) 480 445 goto out; 481 446 482 - /* Expect read-only metadata before unclone */ 447 + /* Expect read-write metadata before unclone */ 483 448 bpf_dynptr_from_skb_meta(ctx, 0, &meta); 484 - if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) 449 + if (bpf_dynptr_is_rdonly(&meta)) 450 + goto out; 451 + 452 + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); 453 + if (err || !check_metadata(meta_have)) 485 454 goto out; 486 455 487 456 /* Helper write to payload will unclone the packet */ 488 457 bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0); 489 458 490 - /* Expect no metadata after unclone */ 491 - bpf_dynptr_from_skb_meta(ctx, 0, &meta); 492 - if (bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != 0) 459 + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); 460 + if (err || !check_metadata(meta_have)) 493 461 goto out; 494 462 495 463 test_pass = true; ··· 504 460 } 505 461 506 462 /* 507 - * Check that skb_meta dynptr is read-only if prog writes to packet 508 - * metadata using dynptr_write helper. Applies only to cloned skbs. 463 + * Check that, when operating on a cloned packet, skb_meta dynptr is read-write 464 + * before prog writes to packet _metadata_ using dynptr_write helper and 465 + * metadata remains intact before and after the write. 
509 466 */ 510 467 SEC("tc") 511 - int clone_dynptr_rdonly_before_meta_dynptr_write(struct __sk_buff *ctx) 468 + int clone_meta_dynptr_rw_before_meta_dynptr_write(struct __sk_buff *ctx) 512 469 { 513 470 struct bpf_dynptr data, meta; 471 + __u8 meta_have[META_SIZE]; 514 472 const struct ethhdr *eth; 473 + int err; 515 474 516 475 bpf_dynptr_from_skb(ctx, 0, &data); 517 476 eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); 518 477 if (!eth) 519 478 goto out; 520 479 /* Ignore non-test packets */ 521 - if (eth->h_proto != 0) 480 + if (!check_smac(eth)) 522 481 goto out; 523 482 524 - /* Expect read-only metadata */ 483 + /* Expect read-write metadata before unclone */ 525 484 bpf_dynptr_from_skb_meta(ctx, 0, &meta); 526 - if (!bpf_dynptr_is_rdonly(&meta) || bpf_dynptr_size(&meta) != META_SIZE) 485 + if (bpf_dynptr_is_rdonly(&meta)) 527 486 goto out; 528 487 529 - /* Metadata write. Expect failure. */ 530 - bpf_dynptr_from_skb_meta(ctx, 0, &meta); 531 - if (bpf_dynptr_write(&meta, 0, "x", 1, 0) != -EINVAL) 488 + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); 489 + if (err || !check_metadata(meta_have)) 490 + goto out; 491 + 492 + /* Helper write to metadata will unclone the packet */ 493 + bpf_dynptr_write(&meta, 0, &meta_have[0], 1, 0); 494 + 495 + err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); 496 + if (err || !check_metadata(meta_have)) 497 + goto out; 498 + 499 + test_pass = true; 500 + out: 501 + return TC_ACT_SHOT; 502 + } 503 + 504 + SEC("tc") 505 + int helper_skb_vlan_push_pop(struct __sk_buff *ctx) 506 + { 507 + int err; 508 + 509 + /* bpf_skb_vlan_push assumes HW offload for primary VLAN tag. Only 510 + * secondary tag push triggers an actual MAC header modification. 
511 + */ 512 + err = bpf_skb_vlan_push(ctx, 0, 42); 513 + if (err) 514 + goto out; 515 + err = bpf_skb_vlan_push(ctx, 0, 207); 516 + if (err) 517 + goto out; 518 + 519 + if (!check_skb_metadata(ctx)) 520 + goto out; 521 + 522 + err = bpf_skb_vlan_pop(ctx); 523 + if (err) 524 + goto out; 525 + err = bpf_skb_vlan_pop(ctx); 526 + if (err) 527 + goto out; 528 + 529 + if (!check_skb_metadata(ctx)) 530 + goto out; 531 + 532 + test_pass = true; 533 + out: 534 + return TC_ACT_SHOT; 535 + } 536 + 537 + SEC("tc") 538 + int helper_skb_adjust_room(struct __sk_buff *ctx) 539 + { 540 + int err; 541 + 542 + /* Grow a 1 byte hole after the MAC header */ 543 + err = bpf_skb_adjust_room(ctx, 1, BPF_ADJ_ROOM_MAC, 0); 544 + if (err) 545 + goto out; 546 + 547 + if (!check_skb_metadata(ctx)) 548 + goto out; 549 + 550 + /* Shrink a 1 byte hole after the MAC header */ 551 + err = bpf_skb_adjust_room(ctx, -1, BPF_ADJ_ROOM_MAC, 0); 552 + if (err) 553 + goto out; 554 + 555 + if (!check_skb_metadata(ctx)) 556 + goto out; 557 + 558 + /* Grow a 256 byte hole to trigger head reallocation */ 559 + err = bpf_skb_adjust_room(ctx, 256, BPF_ADJ_ROOM_MAC, 0); 560 + if (err) 561 + goto out; 562 + 563 + if (!check_skb_metadata(ctx)) 564 + goto out; 565 + 566 + test_pass = true; 567 + out: 568 + return TC_ACT_SHOT; 569 + } 570 + 571 + SEC("tc") 572 + int helper_skb_change_head_tail(struct __sk_buff *ctx) 573 + { 574 + int err; 575 + 576 + /* Reserve 1 extra in the front for packet data */ 577 + err = bpf_skb_change_head(ctx, 1, 0); 578 + if (err) 579 + goto out; 580 + 581 + if (!check_skb_metadata(ctx)) 582 + goto out; 583 + 584 + /* Reserve 256 extra bytes in the front to trigger head reallocation */ 585 + err = bpf_skb_change_head(ctx, 256, 0); 586 + if (err) 587 + goto out; 588 + 589 + if (!check_skb_metadata(ctx)) 590 + goto out; 591 + 592 + /* Reserve 4k extra bytes in the back to trigger head reallocation */ 593 + err = bpf_skb_change_tail(ctx, ctx->len + 4096, 0); 594 + if (err) 595 + goto out; 
596 + 597 + if (!check_skb_metadata(ctx)) 598 + goto out; 599 + 600 + test_pass = true; 601 + out: 602 + return TC_ACT_SHOT; 603 + } 604 + 605 + SEC("tc") 606 + int helper_skb_change_proto(struct __sk_buff *ctx) 607 + { 608 + int err; 609 + 610 + err = bpf_skb_change_proto(ctx, bpf_htons(ETH_P_IPV6), 0); 611 + if (err) 612 + goto out; 613 + 614 + if (!check_skb_metadata(ctx)) 615 + goto out; 616 + 617 + err = bpf_skb_change_proto(ctx, bpf_htons(ETH_P_IP), 0); 618 + if (err) 619 + goto out; 620 + 621 + if (!check_skb_metadata(ctx)) 532 622 goto out; 533 623 534 624 test_pass = true;