Merge branch 'net-cover-more-per-cpu-storage-with-local-nested-bh-locking'

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge branch 'net-cover-more-per-cpu-storage-with-local-nested-bh-locking'

Sebastian Andrzej Siewior says:

====================
net: Cover more per-CPU storage with local nested BH locking

I was looking at the build-time defined per-CPU variables in net/ and
added the needed local-BH-locks in order to be able to remove the
current per-CPU lock in local_bh_disable() on PREEMPT_RT.

The work is not yet complete, I just wanted to post what I have so far
instead of sitting on it.

v3: https://lore.kernel.org/all/20250430124758.1159480-1-bigeasy@linutronix.de/
v2: https://lore.kernel.org/all/20250414160754.503321-1-bigeasy@linutronix.de
v1: https://lore.kernel.org/all/20250309144653.825351-1-bigeasy@linutronix.de
====================

Link: https://patch.msgid.link/20250512092736.229935-1-bigeasy@linutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Paolo Abeni 1 year ago 4c032725 67fa7564

+241 -127

17 changed files

expand all collapse all

include

linux

netdevice.h

netdevice_xmit.h

net

core

dev.c

dst_cache.c

page_pool.c

xdp.c

ipv4

route.c

ipv6

seg6_hmac.c

mptcp

protocol.c

protocol.h

openvswitch

actions.c

datapath.c

datapath.h

rds

page.c

sched

act_mirred.c

sch_frag.c

xfrm

xfrm_nat_keepalive.c

+6 -1

include/linux/netdevice.h

reviewed

··· 3503 3503 }; 3504 3504 3505 3505 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); 3506 3506 - DECLARE_PER_CPU(struct page_pool *, system_page_pool); 3506 3506 + 3507 3507 + struct page_pool_bh { 3508 3508 + struct page_pool *pool; 3509 3509 + local_lock_t bh_lock; 3510 3510 + }; 3511 3511 + DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); 3507 3512 3508 3513 #ifndef CONFIG_PREEMPT_RT 3509 3514 static inline int dev_recursion_level(void)

include/linux/netdevice_xmit.h

reviewed

··· 8 8 #ifdef CONFIG_NET_EGRESS 9 9 u8 skip_txqueue; 10 10 #endif 11 11 + #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) 12 12 + u8 sched_mirred_nest; 13 13 + #endif 11 14 }; 12 15 13 16 #endif

+10 -5

net/core/dev.c

reviewed

··· 462 462 * PP consumers must pay attention to run APIs in the appropriate context 463 463 * (e.g. NAPI context). 464 464 */ 465 465 - DEFINE_PER_CPU(struct page_pool *, system_page_pool); 465 465 + DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { 466 466 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 467 467 + }; 466 468 467 469 #ifdef CONFIG_LOCKDEP 468 470 /* ··· 5324 5322 struct sk_buff *skb = *pskb; 5325 5323 int err, hroom, troom; 5326 5324 5327 5327 - if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) 5325 5325 + local_lock_nested_bh(&system_page_pool.bh_lock); 5326 5326 + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); 5327 5327 + local_unlock_nested_bh(&system_page_pool.bh_lock); 5328 5328 + if (!err) 5328 5329 return 0; 5329 5330 5330 5331 /* In case we have to go down the path and also linearize, ··· 12717 12712 return err; 12718 12713 } 12719 12714 12720 12720 - per_cpu(system_page_pool, cpuid) = pp_ptr; 12715 12715 + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; 12721 12716 #endif 12722 12717 return 0; 12723 12718 } ··· 12847 12842 for_each_possible_cpu(i) { 12848 12843 struct page_pool *pp_ptr; 12849 12844 12850 12850 - pp_ptr = per_cpu(system_page_pool, i); 12845 12845 + pp_ptr = per_cpu(system_page_pool.pool, i); 12851 12846 if (!pp_ptr) 12852 12847 continue; 12853 12848 12854 12849 xdp_unreg_page_pool(pp_ptr); 12855 12850 page_pool_destroy(pp_ptr); 12856 12856 - per_cpu(system_page_pool, i) = NULL; 12851 12851 + per_cpu(system_page_pool.pool, i) = NULL; 12857 12852 } 12858 12853 } 12859 12854

+27 -3

net/core/dst_cache.c

reviewed

··· 17 17 struct dst_cache_pcpu { 18 18 unsigned long refresh_ts; 19 19 struct dst_entry *dst; 20 20 + local_lock_t bh_lock; 20 21 u32 cookie; 21 22 union { 22 23 struct in_addr in_saddr; ··· 66 65 67 66 struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) 68 67 { 68 68 + struct dst_entry *dst; 69 69 + 69 70 if (!dst_cache->cache) 70 71 return NULL; 71 72 72 72 - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); 73 73 + local_lock_nested_bh(&dst_cache->cache->bh_lock); 74 74 + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); 75 75 + local_unlock_nested_bh(&dst_cache->cache->bh_lock); 76 76 + return dst; 73 77 } 74 78 EXPORT_SYMBOL_GPL(dst_cache_get); 75 79 ··· 86 80 if (!dst_cache->cache) 87 81 return NULL; 88 82 83 83 + local_lock_nested_bh(&dst_cache->cache->bh_lock); 89 84 idst = this_cpu_ptr(dst_cache->cache); 90 85 dst = dst_cache_per_cpu_get(dst_cache, idst); 91 91 - if (!dst) 86 86 + if (!dst) { 87 87 + local_unlock_nested_bh(&dst_cache->cache->bh_lock); 92 88 return NULL; 89 89 + } 93 90 94 91 *saddr = idst->in_saddr.s_addr; 92 92 + local_unlock_nested_bh(&dst_cache->cache->bh_lock); 95 93 return dst_rtable(dst); 96 94 } 97 95 EXPORT_SYMBOL_GPL(dst_cache_get_ip4); ··· 108 98 if (!dst_cache->cache) 109 99 return; 110 100 101 101 + local_lock_nested_bh(&dst_cache->cache->bh_lock); 111 102 idst = this_cpu_ptr(dst_cache->cache); 112 103 dst_cache_per_cpu_dst_set(idst, dst, 0); 113 104 idst->in_saddr.s_addr = saddr; 105 105 + local_unlock_nested_bh(&dst_cache->cache->bh_lock); 114 106 } 115 107 EXPORT_SYMBOL_GPL(dst_cache_set_ip4); 116 108 ··· 125 113 if (!dst_cache->cache) 126 114 return; 127 115 116 116 + local_lock_nested_bh(&dst_cache->cache->bh_lock); 117 117 + 128 118 idst = this_cpu_ptr(dst_cache->cache); 129 119 dst_cache_per_cpu_dst_set(idst, dst, 130 120 rt6_get_cookie(dst_rt6_info(dst))); 131 121 idst->in6_saddr = *saddr; 122 122 + local_unlock_nested_bh(&dst_cache->cache->bh_lock); 132 123 } 133 
124 EXPORT_SYMBOL_GPL(dst_cache_set_ip6); 134 125 ··· 144 129 if (!dst_cache->cache) 145 130 return NULL; 146 131 132 132 + local_lock_nested_bh(&dst_cache->cache->bh_lock); 133 133 + 147 134 idst = this_cpu_ptr(dst_cache->cache); 148 135 dst = dst_cache_per_cpu_get(dst_cache, idst); 149 149 - if (!dst) 136 136 + if (!dst) { 137 137 + local_unlock_nested_bh(&dst_cache->cache->bh_lock); 150 138 return NULL; 139 139 + } 151 140 152 141 *saddr = idst->in6_saddr; 142 142 + local_unlock_nested_bh(&dst_cache->cache->bh_lock); 153 143 return dst; 154 144 } 155 145 EXPORT_SYMBOL_GPL(dst_cache_get_ip6); ··· 162 142 163 143 int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) 164 144 { 145 145 + unsigned int i; 146 146 + 165 147 dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, 166 148 gfp | __GFP_ZERO); 167 149 if (!dst_cache->cache) 168 150 return -ENOMEM; 151 151 + for_each_possible_cpu(i) 152 152 + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); 169 153 170 154 dst_cache_reset(dst_cache); 171 155 return 0;

net/core/page_pool.c

reviewed

··· 839 839 const struct napi_struct *napi; 840 840 u32 cpuid; 841 841 842 842 + /* On PREEMPT_RT the softirq can be preempted by the consumer */ 843 843 + if (IS_ENABLED(CONFIG_PREEMPT_RT)) 844 844 + return false; 845 845 + 842 846 if (unlikely(!in_softirq())) 843 847 return false; 844 848

+10 -5

net/core/xdp.c

reviewed

··· 739 739 */ 740 740 struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) 741 741 { 742 742 - struct page_pool *pp = this_cpu_read(system_page_pool); 743 742 const struct xdp_rxq_info *rxq = xdp->rxq; 744 743 u32 len = xdp->data_end - xdp->data_meta; 745 744 u32 truesize = xdp->frame_sz; 746 746 - struct sk_buff *skb; 745 745 + struct sk_buff *skb = NULL; 746 746 + struct page_pool *pp; 747 747 int metalen; 748 748 void *data; 749 749 750 750 if (!IS_ENABLED(CONFIG_PAGE_POOL)) 751 751 return NULL; 752 752 753 753 + local_lock_nested_bh(&system_page_pool.bh_lock); 754 754 + pp = this_cpu_read(system_page_pool.pool); 753 755 data = page_pool_dev_alloc_va(pp, &truesize); 754 756 if (unlikely(!data)) 755 755 - return NULL; 757 757 + goto out; 756 758 757 759 skb = napi_build_skb(data, truesize); 758 760 if (unlikely(!skb)) { 759 761 page_pool_free_va(pp, data, true); 760 760 - return NULL; 762 762 + goto out; 761 763 } 762 764 763 765 skb_mark_for_recycle(skb); ··· 778 776 if (unlikely(xdp_buff_has_frags(xdp)) && 779 777 unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { 780 778 napi_consume_skb(skb, true); 781 781 - return NULL; 779 779 + skb = NULL; 780 780 + goto out; 782 781 } 783 782 784 783 xsk_buff_free(xdp); 785 784 786 785 skb->protocol = eth_type_trans(skb, rxq->dev); 787 786 787 787 + out: 788 788 + local_unlock_nested_bh(&system_page_pool.bh_lock); 788 789 return skb; 789 790 } 790 791 EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);

net/ipv4/route.c

reviewed

··· 189 189 EXPORT_SYMBOL(ip_tos2prio); 190 190 191 191 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 192 192 + #ifndef CONFIG_PREEMPT_RT 192 193 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) 194 194 + #else 195 195 + #define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) 196 196 + #endif 193 197 194 198 #ifdef CONFIG_PROC_FS 195 199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)

+11 -2

net/ipv6/seg6_hmac.c

reviewed

··· 40 40 #include <net/seg6_hmac.h> 41 41 #include <linux/random.h> 42 42 43 43 - static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); 43 43 + struct hmac_storage { 44 44 + local_lock_t bh_lock; 45 45 + char hmac_ring[SEG6_HMAC_RING_SIZE]; 46 46 + }; 47 47 + 48 48 + static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { 49 49 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 50 50 + }; 44 51 45 52 static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) 46 53 { ··· 194 187 */ 195 188 196 189 local_bh_disable(); 197 197 - ring = this_cpu_ptr(hmac_ring); 190 190 + local_lock_nested_bh(&hmac_storage.bh_lock); 191 191 + ring = this_cpu_ptr(hmac_storage.hmac_ring); 198 192 off = ring; 199 193 200 194 /* source address */ ··· 220 212 221 213 dgsize = __do_hmac(hinfo, ring, plen, tmp_out, 222 214 SEG6_HMAC_MAX_DIGESTSIZE); 215 215 + local_unlock_nested_bh(&hmac_storage.bh_lock); 223 216 local_bh_enable(); 224 217 225 218 if (dgsize < 0)

+3 -1

net/mptcp/protocol.c

reviewed

··· 46 46 static void __mptcp_destroy_sock(struct sock *sk); 47 47 static void mptcp_check_send_data_fin(struct sock *sk); 48 48 49 49 - DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); 49 49 + DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = { 50 50 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 51 51 + }; 50 52 static struct net_device *mptcp_napi_dev; 51 53 52 54 /* Returns end sequence number of the receiver's advertised window */

+8 -1

net/mptcp/protocol.h

reviewed

··· 479 479 480 480 struct mptcp_delegated_action { 481 481 struct napi_struct napi; 482 482 + local_lock_t bh_lock; 482 483 struct list_head head; 483 484 }; 484 485 ··· 671 670 if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) 672 671 return; 673 672 673 673 + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); 674 674 delegated = this_cpu_ptr(&mptcp_delegated_actions); 675 675 schedule = list_empty(&delegated->head); 676 676 list_add_tail(&subflow->delegated_node, &delegated->head); 677 677 + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); 677 678 sock_hold(mptcp_subflow_tcp_sock(subflow)); 678 679 if (schedule) 679 680 napi_schedule(&delegated->napi); ··· 687 684 { 688 685 struct mptcp_subflow_context *ret; 689 686 690 690 - if (list_empty(&delegated->head)) 687 687 + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); 688 688 + if (list_empty(&delegated->head)) { 689 689 + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); 691 690 return NULL; 691 691 + } 692 692 693 693 ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); 694 694 list_del_init(&ret->delegated_node); 695 695 + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); 695 696 return ret; 696 697 } 697 698

+13 -73

net/openvswitch/actions.c

reviewed

··· 39 39 #include "flow_netlink.h" 40 40 #include "openvswitch_trace.h" 41 41 42 42 - struct deferred_action { 43 43 - struct sk_buff *skb; 44 44 - const struct nlattr *actions; 45 45 - int actions_len; 46 46 - 47 47 - /* Store pkt_key clone when creating deferred action. */ 48 48 - struct sw_flow_key pkt_key; 42 42 + DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = { 43 43 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 49 44 }; 50 50 - 51 51 - #define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) 52 52 - struct ovs_frag_data { 53 53 - unsigned long dst; 54 54 - struct vport *vport; 55 55 - struct ovs_skb_cb cb; 56 56 - __be16 inner_protocol; 57 57 - u16 network_offset; /* valid only for MPLS */ 58 58 - u16 vlan_tci; 59 59 - __be16 vlan_proto; 60 60 - unsigned int l2_len; 61 61 - u8 mac_proto; 62 62 - u8 l2_data[MAX_L2_LEN]; 63 63 - }; 64 64 - 65 65 - static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); 66 66 - 67 67 - #define DEFERRED_ACTION_FIFO_SIZE 10 68 68 - #define OVS_RECURSION_LIMIT 5 69 69 - #define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) 70 70 - struct action_fifo { 71 71 - int head; 72 72 - int tail; 73 73 - /* Deferred action fifo queue storage. */ 74 74 - struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; 75 75 - }; 76 76 - 77 77 - struct action_flow_keys { 78 78 - struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; 79 79 - }; 80 80 - 81 81 - static struct action_fifo __percpu *action_fifos; 82 82 - static struct action_flow_keys __percpu *flow_keys; 83 83 - static DEFINE_PER_CPU(int, exec_actions_level); 84 45 85 46 /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' 86 47 * space. Return NULL if out of key spaces. 
87 48 */ 88 49 static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) 89 50 { 90 90 - struct action_flow_keys *keys = this_cpu_ptr(flow_keys); 91 91 - int level = this_cpu_read(exec_actions_level); 51 51 + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); 52 52 + struct action_flow_keys *keys = &ovs_pcpu->flow_keys; 53 53 + int level = ovs_pcpu->exec_level; 92 54 struct sw_flow_key *key = NULL; 93 55 94 56 if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { ··· 94 132 const struct nlattr *actions, 95 133 const int actions_len) 96 134 { 97 97 - struct action_fifo *fifo; 135 135 + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); 98 136 struct deferred_action *da; 99 137 100 100 - fifo = this_cpu_ptr(action_fifos); 101 138 da = action_fifo_put(fifo); 102 139 if (da) { 103 140 da->skb = skb; ··· 755 794 static int ovs_vport_output(struct net *net, struct sock *sk, 756 795 struct sk_buff *skb) 757 796 { 758 758 - struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); 797 797 + struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); 759 798 struct vport *vport = data->vport; 760 799 761 800 if (skb_cow_head(skb, data->l2_len) < 0) { ··· 807 846 unsigned int hlen = skb_network_offset(skb); 808 847 struct ovs_frag_data *data; 809 848 810 810 - data = this_cpu_ptr(&ovs_frag_data_storage); 849 849 + data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); 811 850 data->dst = skb->_skb_refdst; 812 851 data->vport = vport; 813 852 data->cb = *OVS_CB(skb); ··· 1569 1608 1570 1609 if (actions) { /* Sample action */ 1571 1610 if (clone_flow_key) 1572 1572 - __this_cpu_inc(exec_actions_level); 1611 1611 + __this_cpu_inc(ovs_pcpu_storage.exec_level); 1573 1612 1574 1613 err = do_execute_actions(dp, skb, clone, 1575 1614 actions, len); 1576 1615 1577 1616 if (clone_flow_key) 1578 1578 - __this_cpu_dec(exec_actions_level); 1617 1617 + __this_cpu_dec(ovs_pcpu_storage.exec_level); 1579 1618 } else { /* Recirc 
action */ 1580 1619 clone->recirc_id = recirc_id; 1581 1620 ovs_dp_process_packet(skb, clone); ··· 1611 1650 1612 1651 static void process_deferred_actions(struct datapath *dp) 1613 1652 { 1614 1614 - struct action_fifo *fifo = this_cpu_ptr(action_fifos); 1653 1653 + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); 1615 1654 1616 1655 /* Do not touch the FIFO in case there is no deferred actions. */ 1617 1656 if (action_fifo_is_empty(fifo)) ··· 1642 1681 { 1643 1682 int err, level; 1644 1683 1645 1645 - level = __this_cpu_inc_return(exec_actions_level); 1684 1684 + level = __this_cpu_inc_return(ovs_pcpu_storage.exec_level); 1646 1685 if (unlikely(level > OVS_RECURSION_LIMIT)) { 1647 1686 net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", 1648 1687 ovs_dp_name(dp)); ··· 1659 1698 process_deferred_actions(dp); 1660 1699 1661 1700 out: 1662 1662 - __this_cpu_dec(exec_actions_level); 1701 1701 + __this_cpu_dec(ovs_pcpu_storage.exec_level); 1663 1702 return err; 1664 1664 - } 1665 1665 - 1666 1666 - int action_fifos_init(void) 1667 1667 - { 1668 1668 - action_fifos = alloc_percpu(struct action_fifo); 1669 1669 - if (!action_fifos) 1670 1670 - return -ENOMEM; 1671 1671 - 1672 1672 - flow_keys = alloc_percpu(struct action_flow_keys); 1673 1673 - if (!flow_keys) { 1674 1674 - free_percpu(action_fifos); 1675 1675 - return -ENOMEM; 1676 1676 - } 1677 1677 - 1678 1678 - return 0; 1679 1679 - } 1680 1680 - 1681 1681 - void action_fifos_exit(void) 1682 1682 - { 1683 1683 - free_percpu(action_fifos); 1684 1684 - free_percpu(flow_keys); 1685 1703 }

+25 -8

net/openvswitch/datapath.c

reviewed

··· 244 244 /* Must be called with rcu_read_lock. */ 245 245 void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) 246 246 { 247 247 + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); 247 248 const struct vport *p = OVS_CB(skb)->input_vport; 248 249 struct datapath *dp = p->dp; 249 250 struct sw_flow *flow; 250 251 struct sw_flow_actions *sf_acts; 251 252 struct dp_stats_percpu *stats; 253 253 + bool ovs_pcpu_locked = false; 252 254 u64 *stats_counter; 253 255 u32 n_mask_hit; 254 256 u32 n_cache_hit; ··· 292 290 293 291 ovs_flow_stats_update(flow, key->tp.flags, skb); 294 292 sf_acts = rcu_dereference(flow->sf_acts); 293 293 + /* This path can be invoked recursively: Use the current task to 294 294 + * identify recursive invocation - the lock must be acquired only once. 295 295 + * Even with disabled bottom halves this can be preempted on PREEMPT_RT. 296 296 + * Limit the locking to RT to avoid assigning `owner' if it can be 297 297 + * avoided. 298 298 + */ 299 299 + if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { 300 300 + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); 301 301 + ovs_pcpu->owner = current; 302 302 + ovs_pcpu_locked = true; 303 303 + } 304 304 + 295 305 error = ovs_execute_actions(dp, skb, sf_acts, key); 296 306 if (unlikely(error)) 297 307 net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", 298 308 ovs_dp_name(dp), error); 309 309 + if (ovs_pcpu_locked) { 310 310 + ovs_pcpu->owner = NULL; 311 311 + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); 312 312 + } 299 313 300 314 stats_counter = &stats->n_hit; 301 315 ··· 689 671 sf_acts = rcu_dereference(flow->sf_acts); 690 672 691 673 local_bh_disable(); 674 674 + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); 675 675 + if (IS_ENABLED(CONFIG_PREEMPT_RT)) 676 676 + this_cpu_write(ovs_pcpu_storage.owner, current); 692 677 err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); 678 678 + if 
(IS_ENABLED(CONFIG_PREEMPT_RT)) 679 679 + this_cpu_write(ovs_pcpu_storage.owner, NULL); 680 680 + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); 693 681 local_bh_enable(); 694 682 rcu_read_unlock(); 695 683 ··· 2753 2729 2754 2730 pr_info("Open vSwitch switching datapath\n"); 2755 2731 2756 2756 - err = action_fifos_init(); 2757 2757 - if (err) 2758 2758 - goto error; 2759 2759 - 2760 2732 err = ovs_internal_dev_rtnl_link_register(); 2761 2733 if (err) 2762 2762 - goto error_action_fifos_exit; 2734 2734 + goto error; 2763 2735 2764 2736 err = ovs_flow_init(); 2765 2737 if (err) ··· 2798 2778 ovs_flow_exit(); 2799 2779 error_unreg_rtnl_link: 2800 2780 ovs_internal_dev_rtnl_link_unregister(); 2801 2801 - error_action_fifos_exit: 2802 2802 - action_fifos_exit(); 2803 2781 error: 2804 2782 return err; 2805 2783 } ··· 2813 2795 ovs_vport_exit(); 2814 2796 ovs_flow_exit(); 2815 2797 ovs_internal_dev_rtnl_link_unregister(); 2816 2816 - action_fifos_exit(); 2817 2798 } 2818 2799 2819 2800 module_init(dp_init);

+49 -3

net/openvswitch/datapath.h

reviewed

··· 13 13 #include <linux/skbuff.h> 14 14 #include <linux/u64_stats_sync.h> 15 15 #include <net/ip_tunnels.h> 16 16 + #include <net/mpls.h> 16 17 17 18 #include "conntrack.h" 18 19 #include "flow.h" ··· 174 173 bool xt_label; 175 174 }; 176 175 176 176 + #define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) 177 177 + struct ovs_frag_data { 178 178 + unsigned long dst; 179 179 + struct vport *vport; 180 180 + struct ovs_skb_cb cb; 181 181 + __be16 inner_protocol; 182 182 + u16 network_offset; /* valid only for MPLS */ 183 183 + u16 vlan_tci; 184 184 + __be16 vlan_proto; 185 185 + unsigned int l2_len; 186 186 + u8 mac_proto; 187 187 + u8 l2_data[MAX_L2_LEN]; 188 188 + }; 189 189 + 190 190 + struct deferred_action { 191 191 + struct sk_buff *skb; 192 192 + const struct nlattr *actions; 193 193 + int actions_len; 194 194 + 195 195 + /* Store pkt_key clone when creating deferred action. */ 196 196 + struct sw_flow_key pkt_key; 197 197 + }; 198 198 + 199 199 + #define DEFERRED_ACTION_FIFO_SIZE 10 200 200 + #define OVS_RECURSION_LIMIT 5 201 201 + #define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) 202 202 + 203 203 + struct action_fifo { 204 204 + int head; 205 205 + int tail; 206 206 + /* Deferred action fifo queue storage. */ 207 207 + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; 208 208 + }; 209 209 + 210 210 + struct action_flow_keys { 211 211 + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; 212 212 + }; 213 213 + 214 214 + struct ovs_pcpu_storage { 215 215 + struct action_fifo action_fifos; 216 216 + struct action_flow_keys flow_keys; 217 217 + struct ovs_frag_data frag_data; 218 218 + int exec_level; 219 219 + struct task_struct *owner; 220 220 + local_lock_t bh_lock; 221 221 + }; 222 222 + DECLARE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); 223 223 + 177 224 /** 178 225 * enum ovs_pkt_hash_types - hash info to include with a packet 179 226 * to send to userspace. 
··· 329 280 const struct sw_flow_actions *, struct sw_flow_key *); 330 281 331 282 void ovs_dp_notify_wq(struct work_struct *work); 332 332 - 333 333 - int action_fifos_init(void); 334 334 - void action_fifos_exit(void); 335 283 336 284 /* 'KEY' must not have any bits set outside of the 'MASK' */ 337 285 #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))

+14 -11

net/rds/page.c

reviewed

··· 40 40 struct rds_page_remainder { 41 41 struct page *r_page; 42 42 unsigned long r_offset; 43 43 + local_lock_t bh_lock; 43 44 }; 44 45 45 45 - static 46 46 - DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); 46 46 + static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = { 47 47 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 48 48 + }; 47 49 48 50 /** 49 51 * rds_page_remainder_alloc - build up regions of a message. ··· 71 69 gfp_t gfp) 72 70 { 73 71 struct rds_page_remainder *rem; 74 74 - unsigned long flags; 75 72 struct page *page; 76 73 int ret; 77 74 ··· 88 87 goto out; 89 88 } 90 89 91 91 - rem = &per_cpu(rds_page_remainders, get_cpu()); 92 92 - local_irq_save(flags); 90 90 + local_bh_disable(); 91 91 + local_lock_nested_bh(&rds_page_remainders.bh_lock); 92 92 + rem = this_cpu_ptr(&rds_page_remainders); 93 93 94 94 while (1) { 95 95 /* avoid a tiny region getting stuck by tossing it */ ··· 118 116 } 119 117 120 118 /* alloc if there is nothing for us to use */ 121 121 - local_irq_restore(flags); 122 122 - put_cpu(); 119 119 + local_unlock_nested_bh(&rds_page_remainders.bh_lock); 120 120 + local_bh_enable(); 123 121 124 122 page = alloc_page(gfp); 125 123 126 126 - rem = &per_cpu(rds_page_remainders, get_cpu()); 127 127 - local_irq_save(flags); 124 124 + local_bh_disable(); 125 125 + local_lock_nested_bh(&rds_page_remainders.bh_lock); 126 126 + rem = this_cpu_ptr(&rds_page_remainders); 128 127 129 128 if (!page) { 130 129 ret = -ENOMEM; ··· 143 140 rem->r_offset = 0; 144 141 } 145 142 146 146 - local_irq_restore(flags); 147 147 - put_cpu(); 143 143 + local_unlock_nested_bh(&rds_page_remainders.bh_lock); 144 144 + local_bh_enable(); 148 145 out: 149 146 rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, 150 147 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,

+25 -3

net/sched/act_mirred.c

reviewed

··· 30 30 static DEFINE_SPINLOCK(mirred_list_lock); 31 31 32 32 #define MIRRED_NEST_LIMIT 4 33 33 - static DEFINE_PER_CPU(unsigned int, mirred_nest_level); 33 33 + 34 34 + #ifndef CONFIG_PREEMPT_RT 35 35 + static u8 tcf_mirred_nest_level_inc_return(void) 36 36 + { 37 37 + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); 38 38 + } 39 39 + 40 40 + static void tcf_mirred_nest_level_dec(void) 41 41 + { 42 42 + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); 43 43 + } 44 44 + 45 45 + #else 46 46 + static u8 tcf_mirred_nest_level_inc_return(void) 47 47 + { 48 48 + return current->net_xmit.sched_mirred_nest++; 49 49 + } 50 50 + 51 51 + static void tcf_mirred_nest_level_dec(void) 52 52 + { 53 53 + current->net_xmit.sched_mirred_nest--; 54 54 + } 55 55 + #endif 34 56 35 57 static bool tcf_mirred_is_act_redirect(int action) 36 58 { ··· 445 423 int m_eaction; 446 424 u32 blockid; 447 425 448 448 - nest_level = __this_cpu_inc_return(mirred_nest_level); 426 426 + nest_level = tcf_mirred_nest_level_inc_return(); 449 427 if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { 450 428 net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", 451 429 netdev_name(skb->dev)); ··· 476 454 retval); 477 455 478 456 dec_nest_level: 479 479 - __this_cpu_dec(mirred_nest_level); 457 457 + tcf_mirred_nest_level_dec(); 480 458 481 459 return retval; 482 460 }

+9 -1

net/sched/sch_frag.c

reviewed

··· 16 16 unsigned int l2_len; 17 17 u8 l2_data[VLAN_ETH_HLEN]; 18 18 int (*xmit)(struct sk_buff *skb); 19 19 + local_lock_t bh_lock; 19 20 }; 20 21 21 21 - static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); 22 22 + static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = { 23 23 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 24 24 + }; 22 25 23 26 static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 24 27 { 25 28 struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); 26 29 30 30 + lockdep_assert_held(&data->bh_lock); 27 31 if (skb_cow_head(skb, data->l2_len) < 0) { 28 32 kfree_skb(skb); 29 33 return -ENOMEM; ··· 99 95 struct rtable sch_frag_rt = { 0 }; 100 96 unsigned long orig_dst; 101 97 98 98 + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); 102 99 sch_frag_prepare_frag(skb, xmit); 103 100 dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 104 101 DST_OBSOLETE_NONE, DST_NOCOUNT); ··· 110 105 IPCB(skb)->frag_max_size = mru; 111 106 112 107 ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); 108 108 + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); 113 109 refdst_drop(orig_dst); 114 110 } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { 115 111 unsigned long orig_dst; 116 112 struct rt6_info sch_frag_rt; 117 113 114 114 + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); 118 115 sch_frag_prepare_frag(skb, xmit); 119 116 memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); 120 117 dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, ··· 129 122 130 123 ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, 131 124 sch_frag_xmit); 125 125 + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); 132 126 refdst_drop(orig_dst); 133 127 } else { 134 128 net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",

+20 -10

net/xfrm/xfrm_nat_keepalive.c

reviewed

··· 9 9 #include <net/ip6_checksum.h> 10 10 #include <net/xfrm.h> 11 11 12 12 - static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4); 12 12 + static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = { 13 13 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 14 14 + }; 13 15 #if IS_ENABLED(CONFIG_IPV6) 14 14 - static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6); 16 16 + static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = { 17 17 + .bh_lock = INIT_LOCAL_LOCK(bh_lock), 18 18 + }; 15 19 #endif 16 20 17 21 struct nat_keepalive { ··· 60 56 61 57 skb_dst_set(skb, &rt->dst); 62 58 63 63 - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4); 59 59 + local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); 60 60 + sk = this_cpu_read(nat_keepalive_sk_ipv4.sock); 64 61 sock_net_set(sk, net); 65 62 err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos); 66 63 sock_net_set(sk, &init_net); 64 64 + local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); 67 65 return err; 68 66 } 69 67 ··· 95 89 fl6.fl6_sport = ka->encap_sport; 96 90 fl6.fl6_dport = ka->encap_dport; 97 91 98 98 - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6); 92 92 + local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); 93 93 + sk = this_cpu_read(nat_keepalive_sk_ipv6.sock); 99 94 sock_net_set(sk, net); 100 95 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL); 101 101 - if (IS_ERR(dst)) 96 96 + if (IS_ERR(dst)) { 97 97 + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); 102 98 return PTR_ERR(dst); 99 99 + } 103 100 104 101 skb_dst_set(skb, dst); 105 102 err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0); 106 103 sock_net_set(sk, &init_net); 104 104 + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); 107 105 return err; 108 106 } 109 107 #endif ··· 212 202 (ctx.next_run - ctx.now) * HZ); 213 203 } 214 204 215 215 - static int nat_keepalive_sk_init(struct sock * __percpu *socks, 205 205 + static int nat_keepalive_sk_init(struct sock_bh_locked 
__percpu *socks, 216 206 unsigned short family) 217 207 { 218 208 struct sock *sk; ··· 224 214 if (err < 0) 225 215 goto err; 226 216 227 227 - *per_cpu_ptr(socks, i) = sk; 217 217 + per_cpu_ptr(socks, i)->sock = sk; 228 218 } 229 219 230 220 return 0; 231 221 err: 232 222 for_each_possible_cpu(i) 233 233 - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); 223 223 + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); 234 224 return err; 235 225 } 236 226 237 237 - static void nat_keepalive_sk_fini(struct sock * __percpu *socks) 227 227 + static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks) 238 228 { 239 229 int i; 240 230 241 231 for_each_possible_cpu(i) 242 242 - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); 232 232 + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); 243 233 } 244 234 245 235 void xfrm_nat_keepalive_state_updated(struct xfrm_state *x)