Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net/smc: bpf: Introduce generic hook for handshake flow

The introduction of IPPROTO_SMC enables eBPF programs to determine
whether to use SMC based on the context of socket creation, such as
network namespaces, PID and comm name, etc.

As a subsequent enhancement, introduce a new generic hook that allows
deciding at runtime whether to use SMC or not, based on criteria
including but not limited to local/remote IP addresses or ports.

Users can now write their own implementation via bpf_struct_ops to choose
whether to use SMC or not before the TCP three-way handshake is completed.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Link: https://patch.msgid.link/20251107035632.115950-3-alibuda@linux.alibaba.com

authored by

D. Wythe and committed by
Martin KaFai Lau
15f295f5 07c428ec

+355 -14
+3
include/net/netns/smc.h
··· 17 17 #ifdef CONFIG_SYSCTL 18 18 struct ctl_table_header *smc_hdr; 19 19 #endif 20 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 21 + struct smc_hs_ctrl __rcu *hs_ctrl; 22 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 20 23 unsigned int sysctl_autocorking_size; 21 24 unsigned int sysctl_smcr_buf_type; 22 25 int sysctl_smcr_testlink_time;
+53
include/net/smc.h
··· 17 17 #include <linux/wait.h> 18 18 #include <linux/dibs.h> 19 19 20 + struct tcp_sock; 21 + struct inet_request_sock; 20 22 struct sock; 21 23 22 24 #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ ··· 51 49 wait_queue_head_t lgrs_deleted; 52 50 u8 going_away : 1; 53 51 }; 52 + 53 + #define SMC_HS_CTRL_NAME_MAX 16 54 + 55 + enum { 56 + /* ops can be inherit from init_net */ 57 + SMC_HS_CTRL_FLAG_INHERITABLE = 0x1, 58 + 59 + SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE, 60 + }; 61 + 62 + struct smc_hs_ctrl { 63 + /* private */ 64 + 65 + struct list_head list; 66 + struct module *owner; 67 + 68 + /* public */ 69 + 70 + /* unique name */ 71 + char name[SMC_HS_CTRL_NAME_MAX]; 72 + int flags; 73 + 74 + /* Invoked before computing SMC option for SYN packets. 75 + * We can control whether to set SMC options by returning various value. 76 + * Return 0 to disable SMC, or return any other value to enable it. 77 + */ 78 + int (*syn_option)(struct tcp_sock *tp); 79 + 80 + /* Invoked before Set up SMC options for SYN-ACK packets 81 + * We can control whether to respond SMC options by returning various 82 + * value. Return 0 to disable SMC, or return any other value to enable 83 + * it. 84 + */ 85 + int (*synack_option)(const struct tcp_sock *tp, 86 + struct inet_request_sock *ireq); 87 + }; 88 + 89 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 90 + #define smc_call_hsbpf(init_val, tp, func, ...) ({ \ 91 + typeof(init_val) __ret = (init_val); \ 92 + struct smc_hs_ctrl *ctrl; \ 93 + rcu_read_lock(); \ 94 + ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \ 95 + if (ctrl && ctrl->func) \ 96 + __ret = ctrl->func(tp, ##__VA_ARGS__); \ 97 + rcu_read_unlock(); \ 98 + __ret; \ 99 + }) 100 + #else 101 + #define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); }) 102 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 54 103 55 104 #endif /* _SMC_H */
+17 -14
net/ipv4/tcp_output.c
··· 40 40 #include <net/tcp.h> 41 41 #include <net/tcp_ecn.h> 42 42 #include <net/mptcp.h> 43 + #include <net/smc.h> 43 44 #include <net/proto_memory.h> 44 45 #include <net/psp.h> 45 46 ··· 803 802 mptcp_options_write(th, ptr, tp, opts); 804 803 } 805 804 806 - static void smc_set_option(const struct tcp_sock *tp, 805 + static void smc_set_option(struct tcp_sock *tp, 807 806 struct tcp_out_options *opts, 808 807 unsigned int *remaining) 809 808 { 810 809 #if IS_ENABLED(CONFIG_SMC) 811 - if (static_branch_unlikely(&tcp_have_smc)) { 812 - if (tp->syn_smc) { 813 - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 814 - opts->options |= OPTION_SMC; 815 - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 816 - } 810 + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) { 811 + tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option); 812 + /* re-check syn_smc */ 813 + if (tp->syn_smc && 814 + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 815 + opts->options |= OPTION_SMC; 816 + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 817 817 } 818 818 } 819 819 #endif 820 820 } 821 821 822 822 static void smc_set_option_cond(const struct tcp_sock *tp, 823 - const struct inet_request_sock *ireq, 823 + struct inet_request_sock *ireq, 824 824 struct tcp_out_options *opts, 825 825 unsigned int *remaining) 826 826 { 827 827 #if IS_ENABLED(CONFIG_SMC) 828 - if (static_branch_unlikely(&tcp_have_smc)) { 829 - if (tp->syn_smc && ireq->smc_ok) { 830 - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 831 - opts->options |= OPTION_SMC; 832 - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 833 - } 828 + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) { 829 + ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq); 830 + /* re-check smc_ok */ 831 + if (ireq->smc_ok && 832 + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { 833 + opts->options |= OPTION_SMC; 834 + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; 834 835 } 835 836 } 836 837 #endif
+10
net/smc/Kconfig
··· 19 19 smcss. 20 20 21 21 if unsure, say Y. 22 + 23 + config SMC_HS_CTRL_BPF 24 + bool "Generic eBPF hook for SMC handshake flow" 25 + depends on SMC && BPF_SYSCALL 26 + default y 27 + help 28 + SMC_HS_CTRL_BPF enables support to register generic eBPF hook for SMC 29 + handshake flow, which offers much greater flexibility in modifying the behavior 30 + of the SMC protocol stack compared to a complete kernel-based approach. Select 31 + this option if you want to filter the handshake process via eBPF programs.
+1
net/smc/Makefile
··· 6 6 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o 7 7 smc-y += smc_tracepoint.o smc_inet.o 8 8 smc-$(CONFIG_SYSCTL) += smc_sysctl.o 9 + smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o
+9
net/smc/af_smc.c
··· 58 58 #include "smc_tracepoint.h" 59 59 #include "smc_sysctl.h" 60 60 #include "smc_inet.h" 61 + #include "smc_hs_bpf.h" 61 62 62 63 static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group 63 64 * creation on server ··· 3601 3600 pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); 3602 3601 goto out_ulp; 3603 3602 } 3603 + rc = bpf_smc_hs_ctrl_init(); 3604 + if (rc) { 3605 + pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__, 3606 + rc); 3607 + goto out_inet; 3608 + } 3604 3609 static_branch_enable(&tcp_have_smc); 3605 3610 return 0; 3611 + out_inet: 3612 + smc_inet_exit(); 3606 3613 out_ulp: 3607 3614 tcp_unregister_ulp(&smc_ulp_ops); 3608 3615 out_ib:
+140
net/smc/smc_hs_bpf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 + * 5 + * Generic hook for SMC handshake flow. 6 + * 7 + * Copyright IBM Corp. 2016 8 + * Copyright (c) 2025, Alibaba Inc. 9 + * 10 + * Author: D. Wythe <alibuda@linux.alibaba.com> 11 + */ 12 + 13 + #include <linux/bpf_verifier.h> 14 + #include <linux/bpf.h> 15 + #include <linux/btf.h> 16 + #include <linux/rculist.h> 17 + 18 + #include "smc_hs_bpf.h" 19 + 20 + static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock); 21 + static LIST_HEAD(smc_hs_ctrl_list); 22 + 23 + static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl) 24 + { 25 + int ret = 0; 26 + 27 + spin_lock(&smc_hs_ctrl_list_lock); 28 + /* already exist or duplicate name */ 29 + if (smc_hs_ctrl_find_by_name(ctrl->name)) 30 + ret = -EEXIST; 31 + else 32 + list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list); 33 + spin_unlock(&smc_hs_ctrl_list_lock); 34 + return ret; 35 + } 36 + 37 + static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl) 38 + { 39 + spin_lock(&smc_hs_ctrl_list_lock); 40 + list_del_rcu(&ctrl->list); 41 + spin_unlock(&smc_hs_ctrl_list_lock); 42 + 43 + /* Ensure that all readers to complete */ 44 + synchronize_rcu(); 45 + } 46 + 47 + struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name) 48 + { 49 + struct smc_hs_ctrl *ctrl; 50 + 51 + list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) { 52 + if (strcmp(ctrl->name, name) == 0) 53 + return ctrl; 54 + } 55 + return NULL; 56 + } 57 + 58 + static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; } 59 + static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp, 60 + struct inet_request_sock *ireq) 61 + { 62 + return 1; 63 + } 64 + 65 + static struct smc_hs_ctrl __smc_bpf_hs_ctrl = { 66 + .syn_option = __smc_bpf_stub_set_tcp_option, 67 + .synack_option = __smc_bpf_stub_set_tcp_option_cond, 68 + }; 69 + 70 + static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; } 71 + 72 + static int 
smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link) 73 + { 74 + if (link) 75 + return -EOPNOTSUPP; 76 + 77 + return smc_hs_ctrl_reg(kdata); 78 + } 79 + 80 + static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link) 81 + { 82 + smc_hs_ctrl_unreg(kdata); 83 + } 84 + 85 + static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t, 86 + const struct btf_member *member, 87 + void *kdata, const void *udata) 88 + { 89 + const struct smc_hs_ctrl *u_ctrl; 90 + struct smc_hs_ctrl *k_ctrl; 91 + u32 moff; 92 + 93 + u_ctrl = (const struct smc_hs_ctrl *)udata; 94 + k_ctrl = (struct smc_hs_ctrl *)kdata; 95 + 96 + moff = __btf_member_bit_offset(t, member) / 8; 97 + switch (moff) { 98 + case offsetof(struct smc_hs_ctrl, name): 99 + if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name, 100 + sizeof(u_ctrl->name)) <= 0) 101 + return -EINVAL; 102 + return 1; 103 + case offsetof(struct smc_hs_ctrl, flags): 104 + if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS) 105 + return -EINVAL; 106 + k_ctrl->flags = u_ctrl->flags; 107 + return 1; 108 + default: 109 + break; 110 + } 111 + 112 + return 0; 113 + } 114 + 115 + static const struct bpf_func_proto * 116 + bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 117 + { 118 + return bpf_base_func_proto(func_id, prog); 119 + } 120 + 121 + static const struct bpf_verifier_ops smc_bpf_verifier_ops = { 122 + .get_func_proto = bpf_smc_hs_func_proto, 123 + .is_valid_access = bpf_tracing_btf_ctx_access, 124 + }; 125 + 126 + static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = { 127 + .name = "smc_hs_ctrl", 128 + .init = smc_bpf_hs_ctrl_init, 129 + .reg = smc_bpf_hs_ctrl_reg, 130 + .unreg = smc_bpf_hs_ctrl_unreg, 131 + .cfi_stubs = &__smc_bpf_hs_ctrl, 132 + .verifier_ops = &smc_bpf_verifier_ops, 133 + .init_member = smc_bpf_hs_ctrl_init_member, 134 + .owner = THIS_MODULE, 135 + }; 136 + 137 + int bpf_smc_hs_ctrl_init(void) 138 + { 139 + return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl); 140 + }
+31
net/smc/smc_hs_bpf.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Shared Memory Communications over RDMA (SMC-R) and RoCE 4 + * 5 + * Generic hook for SMC handshake flow. 6 + * 7 + * Copyright IBM Corp. 2016 8 + * Copyright (c) 2025, Alibaba Inc. 9 + * 10 + * Author: D. Wythe <alibuda@linux.alibaba.com> 11 + */ 12 + 13 + #ifndef __SMC_HS_CTRL 14 + #define __SMC_HS_CTRL 15 + 16 + #include <net/smc.h> 17 + 18 + /* Find hs_ctrl by the target name, which is required to be a C string. 19 + * Return NULL if no such ctrl was found; otherwise, return a valid ctrl. 20 + * 21 + * Note: Caller MUST ensure this is invoked under rcu_read_lock. 22 + */ 23 + struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name); 24 + 25 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 26 + int bpf_smc_hs_ctrl_init(void); 27 + #else 28 + static inline int bpf_smc_hs_ctrl_init(void) { return 0; } 29 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 30 + 31 + #endif /* __SMC_HS_CTRL */
+91
net/smc/smc_sysctl.c
··· 12 12 13 13 #include <linux/init.h> 14 14 #include <linux/sysctl.h> 15 + #include <linux/bpf.h> 15 16 #include <net/net_namespace.h> 16 17 17 18 #include "smc.h" 18 19 #include "smc_core.h" 19 20 #include "smc_llc.h" 20 21 #include "smc_sysctl.h" 22 + #include "smc_hs_bpf.h" 21 23 22 24 static int min_sndbuf = SMC_BUF_MIN_SIZE; 23 25 static int min_rcvbuf = SMC_BUF_MIN_SIZE; ··· 33 31 static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; 34 32 static unsigned int smcr_max_wr_min = 2; 35 33 static unsigned int smcr_max_wr_max = 2048; 34 + 35 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 36 + static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name) 37 + { 38 + struct smc_hs_ctrl *ctrl = NULL; 39 + 40 + rcu_read_lock(); 41 + /* null or empty name ask to clear current ctrl */ 42 + if (name && name[0]) { 43 + ctrl = smc_hs_ctrl_find_by_name(name); 44 + if (!ctrl) { 45 + rcu_read_unlock(); 46 + return -EINVAL; 47 + } 48 + /* no change, just return */ 49 + if (ctrl == rcu_dereference(net->smc.hs_ctrl)) { 50 + rcu_read_unlock(); 51 + return 0; 52 + } 53 + if (!bpf_try_module_get(ctrl, ctrl->owner)) { 54 + rcu_read_unlock(); 55 + return -EBUSY; 56 + } 57 + } 58 + /* xhcg old ctrl with the new one atomically */ 59 + ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl))); 60 + /* release old ctrl */ 61 + if (ctrl) 62 + bpf_module_put(ctrl, ctrl->owner); 63 + 64 + rcu_read_unlock(); 65 + return 0; 66 + } 67 + 68 + static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write, 69 + void *buffer, size_t *lenp, loff_t *ppos) 70 + { 71 + struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl); 72 + char val[SMC_HS_CTRL_NAME_MAX]; 73 + const struct ctl_table tbl = { 74 + .data = val, 75 + .maxlen = SMC_HS_CTRL_NAME_MAX, 76 + }; 77 + struct smc_hs_ctrl *ctrl; 78 + int ret; 79 + 80 + rcu_read_lock(); 81 + ctrl = rcu_dereference(net->smc.hs_ctrl); 82 + if (ctrl) 83 + memcpy(val, ctrl->name, sizeof(ctrl->name)); 84 + else 85 + val[0] = '\0'; 
86 + rcu_read_unlock(); 87 + 88 + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 89 + if (ret) 90 + return ret; 91 + 92 + if (write) 93 + ret = smc_net_replace_smc_hs_ctrl(net, val); 94 + return ret; 95 + } 96 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 36 97 37 98 static struct ctl_table smc_table[] = { 38 99 { ··· 184 119 .extra1 = &smcr_max_wr_min, 185 120 .extra2 = &smcr_max_wr_max, 186 121 }, 122 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 123 + { 124 + .procname = "hs_ctrl", 125 + .data = &init_net.smc.hs_ctrl, 126 + .mode = 0644, 127 + .maxlen = SMC_HS_CTRL_NAME_MAX, 128 + .proc_handler = proc_smc_hs_ctrl, 129 + }, 130 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 187 131 }; 188 132 189 133 int __net_init smc_sysctl_net_init(struct net *net) ··· 203 129 table = smc_table; 204 130 if (!net_eq(net, &init_net)) { 205 131 int i; 132 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 133 + struct smc_hs_ctrl *ctrl; 134 + 135 + rcu_read_lock(); 136 + ctrl = rcu_dereference(init_net.smc.hs_ctrl); 137 + if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE && 138 + bpf_try_module_get(ctrl, ctrl->owner)) 139 + rcu_assign_pointer(net->smc.hs_ctrl, ctrl); 140 + rcu_read_unlock(); 141 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 206 142 207 143 table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); 208 144 if (!table) ··· 245 161 if (!net_eq(net, &init_net)) 246 162 kfree(table); 247 163 err_alloc: 164 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 165 + smc_net_replace_smc_hs_ctrl(net, NULL); 166 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 248 167 return -ENOMEM; 249 168 } 250 169 ··· 257 170 258 171 table = net->smc.smc_hdr->ctl_table_arg; 259 172 unregister_net_sysctl_table(net->smc.smc_hdr); 173 + #if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) 174 + smc_net_replace_smc_hs_ctrl(net, NULL); 175 + #endif /* CONFIG_SMC_HS_CTRL_BPF */ 176 + 260 177 if (!net_eq(net, &init_net)) 261 178 kfree(table); 262 179 }