Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'rfs-use-high-order-allocations-for-hash-tables'

Eric Dumazet says:

====================
rfs: use high-order allocations for hash tables

This series adds rps_tag_ptr, which encodes both a pointer to a
power-of-two sized hash table and the ilog2 of its size in a single long word.

RFS hash tables (global and per rx-queue) are converted to rps_tag_ptr.

This removes a cache line miss (the table size is no longer read from the table itself), and allows high-order allocations.

The global hash table can benefit from huge pages.
====================

Link: https://patch.msgid.link/20260302181432.1836150-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+180 -152
+9 -4
Documentation/networking/scaling.rst
··· 403 403 Values for both are rounded up to the nearest power of two. The 404 404 suggested flow count depends on the expected number of active connections 405 405 at any given time, which may be significantly less than the number of open 406 - connections. We have found that a value of 32768 for rps_sock_flow_entries 407 - works fairly well on a moderately loaded server. 406 + connections. We have found that a value of 65536 for rps_sock_flow_entries 407 + works fairly well on a moderately loaded server. Big servers might 408 + need 1048576 or even higher values. 409 + 410 + On a NUMA host it is advisable to spread rps_sock_flow_entries on all nodes. 411 + 412 + numactl --interleave=all bash -c "echo 1048576 >/proc/sys/net/core/rps_sock_flow_entries" 408 413 409 414 For a single queue device, the rps_flow_cnt value for the single queue 410 415 would normally be configured to the same value as rps_sock_flow_entries. 411 416 For a multi-queue device, the rps_flow_cnt for each queue might be 412 417 configured as rps_sock_flow_entries / N, where N is the number of 413 - queues. So for instance, if rps_sock_flow_entries is set to 32768 and there 418 + queues. So for instance, if rps_sock_flow_entries is set to 131072 and there 414 419 are 16 configured receive queues, rps_flow_cnt for each queue might be 415 - configured as 2048. 420 + configured as 8192. 416 421 417 422 418 423 Accelerated RFS
+4 -1
include/net/hotdata.h
··· 6 6 #include <linux/types.h> 7 7 #include <linux/netdevice.h> 8 8 #include <net/protocol.h> 9 + #ifdef CONFIG_RPS 10 + #include <net/rps-types.h> 11 + #endif 9 12 10 13 struct skb_defer_node { 11 14 struct llist_head defer_list; ··· 36 33 struct kmem_cache *skbuff_fclone_cache; 37 34 struct kmem_cache *skb_small_head_cache; 38 35 #ifdef CONFIG_RPS 39 - struct rps_sock_flow_table __rcu *rps_sock_flow_table; 36 + rps_tag_ptr rps_sock_flow_table; 40 37 u32 rps_cpu_mask; 41 38 #endif 42 39 struct skb_defer_node __percpu *skb_defer_nodes;
+2 -1
include/net/netdev_rx_queue.h
··· 8 8 #include <net/xdp.h> 9 9 #include <net/page_pool/types.h> 10 10 #include <net/netdev_queues.h> 11 + #include <net/rps-types.h> 11 12 12 13 /* This structure contains an instance of an RX queue. */ 13 14 struct netdev_rx_queue { 14 15 struct xdp_rxq_info xdp_rxq; 15 16 #ifdef CONFIG_RPS 16 17 struct rps_map __rcu *rps_map; 17 - struct rps_dev_flow_table __rcu *rps_flow_table; 18 + rps_tag_ptr rps_flow_table; 18 19 #endif 19 20 struct kobject kobj; 20 21 const struct attribute_group **groups;
+24
include/net/rps-types.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + #ifndef _NET_RPS_TYPES_H 3 + #define _NET_RPS_TYPES_H 4 + 5 + /* Define a rps_tag_ptr: 6 + * Low order 5 bits are used to store the ilog2(size) of an RPS table. 7 + */ 8 + typedef unsigned long rps_tag_ptr; 9 + 10 + static inline u8 rps_tag_to_log(rps_tag_ptr tag_ptr) 11 + { 12 + return tag_ptr & 31U; 13 + } 14 + 15 + static inline u32 rps_tag_to_mask(rps_tag_ptr tag_ptr) 16 + { 17 + return (1U << rps_tag_to_log(tag_ptr)) - 1; 18 + } 19 + 20 + static inline void *rps_tag_to_table(rps_tag_ptr tag_ptr) 21 + { 22 + return (void *)(tag_ptr & ~31UL); 23 + } 24 + #endif /* _NET_RPS_TYPES_H */
+19 -30
include/net/rps.h
··· 8 8 #include <net/hotdata.h> 9 9 10 10 #ifdef CONFIG_RPS 11 + #include <net/rps-types.h> 11 12 12 13 extern struct static_key_false rps_needed; 13 14 extern struct static_key_false rfs_needed; ··· 40 39 #define RPS_NO_FILTER 0xffff 41 40 42 41 /* 43 - * The rps_dev_flow_table structure contains a table of flow mappings. 44 - */ 45 - struct rps_dev_flow_table { 46 - u8 log; 47 - struct rcu_head rcu; 48 - struct rps_dev_flow flows[]; 49 - }; 50 - #define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ 51 - ((_num) * sizeof(struct rps_dev_flow))) 52 - 53 - /* 54 42 * The rps_sock_flow_table contains mappings of flows to the last CPU 55 43 * on which they were processed by the application (set in recvmsg). 56 44 * Each entry is a 32bit value. Upper part is the high-order bits ··· 50 60 * meaning we use 32-6=26 bits for the hash. 51 61 */ 52 62 struct rps_sock_flow_table { 53 - struct rcu_head rcu; 54 - u32 mask; 55 - 56 - u32 ents[] ____cacheline_aligned_in_smp; 63 + u32 ent; 57 64 }; 58 - #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) 59 65 60 66 #define RPS_NO_CPU 0xffff 61 67 62 - static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, 63 - u32 hash) 68 + static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash) 64 69 { 65 - unsigned int index = hash & table->mask; 70 + unsigned int index = hash & rps_tag_to_mask(tag_ptr); 66 71 u32 val = hash & ~net_hotdata.rps_cpu_mask; 72 + struct rps_sock_flow_table *table; 67 73 68 74 /* We only give a hint, preemption can change CPU under us */ 69 75 val |= raw_smp_processor_id(); 70 76 77 + table = rps_tag_to_table(tag_ptr); 71 78 /* The following WRITE_ONCE() is paired with the READ_ONCE() 72 79 * here, and another one in get_rps_cpu(). 
73 80 */ 74 - if (READ_ONCE(table->ents[index]) != val) 75 - WRITE_ONCE(table->ents[index], val); 81 + if (READ_ONCE(table[index].ent) != val) 82 + WRITE_ONCE(table[index].ent, val); 76 83 } 77 84 78 85 static inline void _sock_rps_record_flow_hash(__u32 hash) 79 86 { 80 - struct rps_sock_flow_table *sock_flow_table; 87 + rps_tag_ptr tag_ptr; 81 88 82 89 if (!hash) 83 90 return; 84 91 rcu_read_lock(); 85 - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); 86 - if (sock_flow_table) 87 - rps_record_sock_flow(sock_flow_table, hash); 92 + tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); 93 + if (tag_ptr) 94 + rps_record_sock_flow(tag_ptr, hash); 88 95 rcu_read_unlock(); 89 96 } 90 97 ··· 108 121 static inline void _sock_rps_delete_flow(const struct sock *sk) 109 122 { 110 123 struct rps_sock_flow_table *table; 124 + rps_tag_ptr tag_ptr; 111 125 u32 hash, index; 112 126 113 127 hash = READ_ONCE(sk->sk_rxhash); ··· 116 128 return; 117 129 118 130 rcu_read_lock(); 119 - table = rcu_dereference(net_hotdata.rps_sock_flow_table); 120 - if (table) { 121 - index = hash & table->mask; 122 - if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) 123 - WRITE_ONCE(table->ents[index], RPS_NO_CPU); 131 + tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); 132 + if (tag_ptr) { 133 + index = hash & rps_tag_to_mask(tag_ptr); 134 + table = rps_tag_to_table(tag_ptr); 135 + if (READ_ONCE(table[index].ent) != RPS_NO_CPU) 136 + WRITE_ONCE(table[index].ent, RPS_NO_CPU); 124 137 } 125 138 rcu_read_unlock(); 126 139 }
+37 -24
net/core/dev.c
··· 4968 4968 struct static_key_false rfs_needed __read_mostly; 4969 4969 EXPORT_SYMBOL(rfs_needed); 4970 4970 4971 - static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) 4971 + static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr) 4972 4972 { 4973 - return hash_32(hash, flow_table->log); 4973 + return hash_32(hash, rps_tag_to_log(tag_ptr)); 4974 4974 } 4975 4975 4976 4976 #ifdef CONFIG_RFS_ACCEL 4977 4977 /** 4978 4978 * rps_flow_is_active - check whether the flow is recently active. 4979 4979 * @rflow: Specific flow to check activity. 4980 - * @flow_table: per-queue flowtable that @rflow belongs to. 4980 + * @log: ilog2(hashsize). 4981 4981 * @cpu: CPU saved in @rflow. 4982 4982 * 4983 4983 * If the CPU has processed many packets since the flow's last activity ··· 4986 4986 * Return: true if flow was recently active. 4987 4987 */ 4988 4988 static bool rps_flow_is_active(struct rps_dev_flow *rflow, 4989 - struct rps_dev_flow_table *flow_table, 4989 + u8 log, 4990 4990 unsigned int cpu) 4991 4991 { 4992 4992 unsigned int flow_last_active; ··· 4999 4999 flow_last_active = READ_ONCE(rflow->last_qtail); 5000 5000 5001 5001 return (int)(sd_input_head - flow_last_active) < 5002 - (int)(10 << flow_table->log); 5002 + (int)(10 << log); 5003 5003 } 5004 5004 #endif 5005 5005 ··· 5011 5011 u32 head; 5012 5012 #ifdef CONFIG_RFS_ACCEL 5013 5013 struct netdev_rx_queue *rxqueue; 5014 - struct rps_dev_flow_table *flow_table; 5014 + struct rps_dev_flow *flow_table; 5015 5015 struct rps_dev_flow *old_rflow; 5016 5016 struct rps_dev_flow *tmp_rflow; 5017 + rps_tag_ptr q_tag_ptr; 5017 5018 unsigned int tmp_cpu; 5018 5019 u16 rxq_index; 5019 5020 u32 flow_id; ··· 5029 5028 goto out; 5030 5029 5031 5030 rxqueue = dev->_rx + rxq_index; 5032 - flow_table = rcu_dereference(rxqueue->rps_flow_table); 5033 - if (!flow_table) 5031 + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); 5032 + if (!q_tag_ptr) 5034 5033 goto out; 5035 5034 5036 - flow_id = rfs_slot(hash, 
flow_table); 5037 - tmp_rflow = &flow_table->flows[flow_id]; 5035 + flow_id = rfs_slot(hash, q_tag_ptr); 5036 + flow_table = rps_tag_to_table(q_tag_ptr); 5037 + tmp_rflow = flow_table + flow_id; 5038 5038 tmp_cpu = READ_ONCE(tmp_rflow->cpu); 5039 5039 5040 5040 if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { 5041 - if (rps_flow_is_active(tmp_rflow, flow_table, 5041 + if (rps_flow_is_active(tmp_rflow, 5042 + rps_tag_to_log(q_tag_ptr), 5042 5043 tmp_cpu)) { 5043 5044 if (hash != READ_ONCE(tmp_rflow->hash) || 5044 5045 next_cpu == tmp_cpu) ··· 5078 5075 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 5079 5076 struct rps_dev_flow **rflowp) 5080 5077 { 5081 - const struct rps_sock_flow_table *sock_flow_table; 5082 5078 struct netdev_rx_queue *rxqueue = dev->_rx; 5083 - struct rps_dev_flow_table *flow_table; 5079 + rps_tag_ptr global_tag_ptr, q_tag_ptr; 5084 5080 struct rps_map *map; 5085 5081 int cpu = -1; 5086 5082 u32 tcpu; ··· 5100 5098 5101 5099 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 5102 5100 5103 - flow_table = rcu_dereference(rxqueue->rps_flow_table); 5101 + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); 5104 5102 map = rcu_dereference(rxqueue->rps_map); 5105 - if (!flow_table && !map) 5103 + if (!q_tag_ptr && !map) 5106 5104 goto done; 5107 5105 5108 5106 skb_reset_network_header(skb); ··· 5110 5108 if (!hash) 5111 5109 goto done; 5112 5110 5113 - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); 5114 - if (flow_table && sock_flow_table) { 5111 + global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); 5112 + if (q_tag_ptr && global_tag_ptr) { 5113 + struct rps_sock_flow_table *sock_flow_table; 5114 + struct rps_dev_flow *flow_table; 5115 5115 struct rps_dev_flow *rflow; 5116 5116 u32 next_cpu; 5117 + u32 flow_id; 5117 5118 u32 ident; 5118 5119 5119 5120 /* First check into global flow table if there is a match. 
5120 5121 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow(). 5121 5122 */ 5122 - ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]); 5123 + flow_id = hash & rps_tag_to_mask(global_tag_ptr); 5124 + sock_flow_table = rps_tag_to_table(global_tag_ptr); 5125 + ident = READ_ONCE(sock_flow_table[flow_id].ent); 5123 5126 if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask) 5124 5127 goto try_rps; 5125 5128 ··· 5133 5126 /* OK, now we know there is a match, 5134 5127 * we can look at the local (per receive queue) flow table 5135 5128 */ 5136 - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; 5129 + flow_id = rfs_slot(hash, q_tag_ptr); 5130 + flow_table = rps_tag_to_table(q_tag_ptr); 5131 + rflow = flow_table + flow_id; 5137 5132 tcpu = rflow->cpu; 5138 5133 5139 5134 /* ··· 5195 5186 u32 flow_id, u16 filter_id) 5196 5187 { 5197 5188 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 5198 - struct rps_dev_flow_table *flow_table; 5189 + struct rps_dev_flow *flow_table; 5199 5190 struct rps_dev_flow *rflow; 5191 + rps_tag_ptr q_tag_ptr; 5200 5192 bool expire = true; 5193 + u8 log; 5201 5194 5202 5195 rcu_read_lock(); 5203 - flow_table = rcu_dereference(rxqueue->rps_flow_table); 5204 - if (flow_table && flow_id < (1UL << flow_table->log)) { 5196 + q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table); 5197 + log = rps_tag_to_log(q_tag_ptr); 5198 + if (q_tag_ptr && flow_id < (1UL << log)) { 5205 5199 unsigned int cpu; 5206 5200 5207 - rflow = &flow_table->flows[flow_id]; 5201 + flow_table = rps_tag_to_table(q_tag_ptr); 5202 + rflow = flow_table + flow_id; 5208 5203 cpu = READ_ONCE(rflow->cpu); 5209 5204 if (READ_ONCE(rflow->filter) == filter_id && 5210 - rps_flow_is_active(rflow, flow_table, cpu)) 5205 + rps_flow_is_active(rflow, log, cpu)) 5211 5206 expire = false; 5212 5207 } 5213 5208 rcu_read_unlock();
+35 -50
net/core/net-sysfs.c
··· 1060 1060 static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, 1061 1061 char *buf) 1062 1062 { 1063 - struct rps_dev_flow_table *flow_table; 1064 1063 unsigned long val = 0; 1064 + rps_tag_ptr tag_ptr; 1065 1065 1066 - rcu_read_lock(); 1067 - flow_table = rcu_dereference(queue->rps_flow_table); 1068 - if (flow_table) 1069 - val = 1UL << flow_table->log; 1070 - rcu_read_unlock(); 1066 + tag_ptr = READ_ONCE(queue->rps_flow_table); 1067 + if (tag_ptr) 1068 + val = 1UL << rps_tag_to_log(tag_ptr); 1071 1069 1072 1070 return sysfs_emit(buf, "%lu\n", val); 1073 - } 1074 - 1075 - static void rps_dev_flow_table_release(struct rcu_head *rcu) 1076 - { 1077 - struct rps_dev_flow_table *table = container_of(rcu, 1078 - struct rps_dev_flow_table, rcu); 1079 - vfree(table); 1080 1071 } 1081 1072 1082 1073 static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, 1083 1074 const char *buf, size_t len) 1084 1075 { 1076 + rps_tag_ptr otag, tag_ptr = 0UL; 1077 + struct rps_dev_flow *table; 1085 1078 unsigned long mask, count; 1086 - struct rps_dev_flow_table *table, *old_table; 1087 - static DEFINE_SPINLOCK(rps_dev_flow_lock); 1079 + size_t sz; 1088 1080 int rc; 1089 1081 1090 1082 if (!capable(CAP_NET_ADMIN)) ··· 1093 1101 */ 1094 1102 while ((mask | (mask >> 1)) != mask) 1095 1103 mask |= (mask >> 1); 1096 - /* On 64 bit arches, must check mask fits in table->mask (u32), 1097 - * and on 32bit arches, must check 1098 - * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. 1099 - */ 1100 - #if BITS_PER_LONG > 32 1101 - if (mask > (unsigned long)(u32)mask) 1104 + 1105 + /* Do not accept too large tables. 
*/ 1106 + if (mask > (INT_MAX / sizeof(*table) - 1)) 1102 1107 return -EINVAL; 1103 - #else 1104 - if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) 1105 - / sizeof(struct rps_dev_flow)) { 1106 - /* Enforce a limit to prevent overflow */ 1107 - return -EINVAL; 1108 - } 1109 - #endif 1110 - table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); 1108 + 1109 + sz = max_t(size_t, sizeof(*table) * (mask + 1), 1110 + PAGE_SIZE); 1111 + if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) || 1112 + is_power_of_2(sizeof(*table))) 1113 + table = kvmalloc(sz, GFP_KERNEL); 1114 + else 1115 + table = vmalloc(sz); 1111 1116 if (!table) 1112 1117 return -ENOMEM; 1113 - 1114 - table->log = ilog2(mask) + 1; 1115 - for (count = 0; count <= mask; count++) { 1116 - table->flows[count].cpu = RPS_NO_CPU; 1117 - table->flows[count].filter = RPS_NO_FILTER; 1118 + tag_ptr = (rps_tag_ptr)table; 1119 + if (rps_tag_to_log(tag_ptr)) { 1120 + pr_err_once("store_rps_dev_flow_table_cnt() got a non page aligned allocation.\n"); 1121 + kvfree(table); 1122 + return -ENOMEM; 1118 1123 } 1119 - } else { 1120 - table = NULL; 1124 + tag_ptr |= (ilog2(mask) + 1); 1125 + for (count = 0; count <= mask; count++) { 1126 + table[count].cpu = RPS_NO_CPU; 1127 + table[count].filter = RPS_NO_FILTER; 1128 + } 1121 1129 } 1122 1130 1123 - spin_lock(&rps_dev_flow_lock); 1124 - old_table = rcu_dereference_protected(queue->rps_flow_table, 1125 - lockdep_is_held(&rps_dev_flow_lock)); 1126 - rcu_assign_pointer(queue->rps_flow_table, table); 1127 - spin_unlock(&rps_dev_flow_lock); 1128 - 1129 - if (old_table) 1130 - call_rcu(&old_table->rcu, rps_dev_flow_table_release); 1131 + otag = xchg(&queue->rps_flow_table, tag_ptr); 1132 + if (otag) 1133 + kvfree_rcu_mightsleep(rps_tag_to_table(otag)); 1131 1134 1132 1135 return len; 1133 1136 } ··· 1148 1161 { 1149 1162 struct netdev_rx_queue *queue = to_rx_queue(kobj); 1150 1163 #ifdef CONFIG_RPS 1164 + rps_tag_ptr tag_ptr; 1151 1165 struct rps_map *map; 1152 - struct 
rps_dev_flow_table *flow_table; 1153 1166 1154 1167 map = rcu_dereference_protected(queue->rps_map, 1); 1155 1168 if (map) { ··· 1157 1170 kfree_rcu(map, rcu); 1158 1171 } 1159 1172 1160 - flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); 1161 - if (flow_table) { 1162 - RCU_INIT_POINTER(queue->rps_flow_table, NULL); 1163 - call_rcu(&flow_table->rcu, rps_dev_flow_table_release); 1164 - } 1173 + tag_ptr = xchg(&queue->rps_flow_table, 0UL); 1174 + if (tag_ptr) 1175 + kvfree_rcu_mightsleep(rps_tag_to_table(tag_ptr)); 1165 1176 #endif 1166 1177 1167 1178 memset(kobj, 0, sizeof(*kobj));
+50 -42
net/core/sysctl_net_core.c
··· 138 138 static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, 139 139 void *buffer, size_t *lenp, loff_t *ppos) 140 140 { 141 + struct rps_sock_flow_table *o_sock_table, *sock_table; 142 + static DEFINE_MUTEX(sock_flow_mutex); 143 + rps_tag_ptr o_tag_ptr, tag_ptr; 141 144 unsigned int orig_size, size; 142 - int ret, i; 143 145 struct ctl_table tmp = { 144 146 .data = &size, 145 147 .maxlen = sizeof(size), 146 148 .mode = table->mode 147 149 }; 148 - struct rps_sock_flow_table *orig_sock_table, *sock_table; 149 - static DEFINE_MUTEX(sock_flow_mutex); 150 + void *tofree = NULL; 151 + int ret, i; 152 + u8 log; 150 153 151 154 mutex_lock(&sock_flow_mutex); 152 155 153 - orig_sock_table = rcu_dereference_protected( 154 - net_hotdata.rps_sock_flow_table, 155 - lockdep_is_held(&sock_flow_mutex)); 156 - size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; 156 + o_tag_ptr = tag_ptr = net_hotdata.rps_sock_flow_table; 157 + 158 + size = o_tag_ptr ? rps_tag_to_mask(o_tag_ptr) + 1 : 0; 159 + o_sock_table = rps_tag_to_table(o_tag_ptr); 160 + orig_size = size; 157 161 158 162 ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); 159 163 160 - if (write) { 161 - if (size) { 162 - if (size > 1<<29) { 163 - /* Enforce limit to prevent overflow */ 164 + if (!write) 165 + goto unlock; 166 + 167 + if (size) { 168 + if (size > 1<<29) { 169 + /* Enforce limit to prevent overflow */ 170 + mutex_unlock(&sock_flow_mutex); 171 + return -EINVAL; 172 + } 173 + sock_table = o_sock_table; 174 + size = roundup_pow_of_two(size); 175 + if (size != orig_size) { 176 + sock_table = vmalloc_huge(size * sizeof(*sock_table), 177 + GFP_KERNEL); 178 + if (!sock_table) { 164 179 mutex_unlock(&sock_flow_mutex); 165 - return -EINVAL; 180 + return -ENOMEM; 166 181 } 167 - size = roundup_pow_of_two(size); 168 - if (size != orig_size) { 169 - sock_table = 170 - vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); 171 - if (!sock_table) { 172 - mutex_unlock(&sock_flow_mutex); 173 - 
return -ENOMEM; 174 - } 175 - net_hotdata.rps_cpu_mask = 176 - roundup_pow_of_two(nr_cpu_ids) - 1; 177 - sock_table->mask = size - 1; 178 - } else 179 - sock_table = orig_sock_table; 182 + net_hotdata.rps_cpu_mask = 183 + roundup_pow_of_two(nr_cpu_ids) - 1; 184 + log = ilog2(size); 185 + tag_ptr = (rps_tag_ptr)sock_table | log; 186 + } 180 187 181 - for (i = 0; i < size; i++) 182 - sock_table->ents[i] = RPS_NO_CPU; 183 - } else 184 - sock_table = NULL; 185 - 186 - if (sock_table != orig_sock_table) { 187 - rcu_assign_pointer(net_hotdata.rps_sock_flow_table, 188 - sock_table); 189 - if (sock_table) { 190 - static_branch_inc(&rps_needed); 191 - static_branch_inc(&rfs_needed); 192 - } 193 - if (orig_sock_table) { 194 - static_branch_dec(&rps_needed); 195 - static_branch_dec(&rfs_needed); 196 - kvfree_rcu(orig_sock_table, rcu); 197 - } 188 + for (i = 0; i < size; i++) 189 + sock_table[i].ent = RPS_NO_CPU; 190 + } else { 191 + sock_table = NULL; 192 + tag_ptr = 0UL; 193 + } 194 + if (tag_ptr != o_tag_ptr) { 195 + smp_store_release(&net_hotdata.rps_sock_flow_table, tag_ptr); 196 + if (sock_table) { 197 + static_branch_inc(&rps_needed); 198 + static_branch_inc(&rfs_needed); 199 + } 200 + if (o_sock_table) { 201 + static_branch_dec(&rps_needed); 202 + static_branch_dec(&rfs_needed); 203 + tofree = o_sock_table; 198 204 } 199 205 } 200 206 207 + unlock: 201 208 mutex_unlock(&sock_flow_mutex); 202 209 210 + kvfree_rcu_mightsleep(tofree); 203 211 return ret; 204 212 } 205 213 #endif /* CONFIG_RPS */