Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'bpf-cpumap-enable-gro-for-xdp_pass-frames'

Alexander Lobakin says:

====================
bpf: cpumap: enable GRO for XDP_PASS frames

Several months ago, I had been looking through my old XDP hints tree[0]
to check whether some patches not directly related to hints can be sent
standalone. Roughly at the same time, Daniel appeared and asked[1] about
GRO for cpumap from that tree.

Currently, cpumap uses its own kthread which processes cpumap-redirected
frames in batches of 8, without any weighting (but with rescheduling
points). The resulting skbs get passed to the stack via
netif_receive_skb_list(), which means no GRO happens.
Even though we can't currently pass checksum status from the drivers,
in many cases GRO performs better than the listified Rx without the
aggregation, confirmed by tests.

In order to enable GRO in cpumap, we need to do the following:

* patches 1-2: decouple the GRO struct from the NAPI struct and allow
using it out of a NAPI entity within the kernel core code;
* patch 3: switch cpumap from netif_receive_skb_list() to
gro_receive_skb().

Additional improvements:

* patch 4: optimize XDP_PASS in cpumap by using arrays instead of linked
lists;
* patches 5-6: introduce and use a function to get skbs from the NAPI percpu
caches by bulks, not one at a time;
* patches 7-8: use that function in veth as well and remove the one that
was now superseded by it.

My trafficgen UDP GRO tests, small frame sizes:

                GRO off    GRO on
baseline        2.7        N/A       Mpps
patch 3         2.3        4         Mpps
patch 8         2.4        4.7       Mpps

1...3 diff      -17        +48       %
1...8 diff      -11        +74       %

Daniel reported from +14%[2] to +18%[3] of throughput in neper's TCP RR
tests. On my system however, the same test gave me up to +100%.

Note that there's a series from Lorenzo[4] which achieves the same, but
in a different way. During the discussions, the approach using a
standalone GRO instance was preferred over the threaded NAPI.

[0] https://github.com/alobakin/linux/tree/xdp_hints
[1] https://lore.kernel.org/bpf/cadda351-6e93-4568-ba26-21a760bf9a57@app.fastmail.com
[2] https://lore.kernel.org/bpf/merfatcdvwpx2lj4j2pahhwp4vihstpidws3jwljwazhh76xkd@t5vsh4gvk4mh
[3] https://lore.kernel.org/bpf/yzda66wro5twmzpmjoxvy4si5zvkehlmgtpi6brheek3sj73tj@o7kd6nurr3o6
[4] https://lore.kernel.org/bpf/20241130-cpumap-gro-v1-0-c1180b1b5758@kernel.org
====================

Link: https://patch.msgid.link/20250225171751.2268401-1-aleksander.lobakin@intel.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+317 -182
+1
drivers/net/ethernet/brocade/bna/bnad.c
··· 19 19 #include <linux/ip.h> 20 20 #include <linux/prefetch.h> 21 21 #include <linux/module.h> 22 + #include <net/gro.h> 22 23 23 24 #include "bnad.h" 24 25 #include "bna.h"
+1
drivers/net/ethernet/cortina/gemini.c
··· 40 40 #include <linux/in.h> 41 41 #include <linux/ip.h> 42 42 #include <linux/ipv6.h> 43 + #include <net/gro.h> 43 44 44 45 #include "gemini.h" 45 46
+1 -2
drivers/net/veth.c
··· 684 684 void *skbs[VETH_XDP_BATCH]; 685 685 int i; 686 686 687 - if (xdp_alloc_skb_bulk(skbs, n_xdpf, 688 - GFP_ATOMIC | __GFP_ZERO) < 0) { 687 + if (unlikely(!napi_skb_cache_get_bulk(skbs, n_xdpf))) { 689 688 for (i = 0; i < n_xdpf; i++) 690 689 xdp_return_frame(frames[i]); 691 690 stats->rx_drops += n_xdpf;
+1
drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c
··· 41 41 #include <linux/types.h> 42 42 #include <linux/wait.h> 43 43 #include <linux/workqueue.h> 44 + #include <net/gro.h> 44 45 45 46 #include "t7xx_dpmaif.h" 46 47 #include "t7xx_hif_dpmaif.h"
+28 -9
include/linux/netdevice.h
··· 340 340 }; 341 341 342 342 /* 343 - * size of gro hash buckets, must less than bit number of 344 - * napi_struct::gro_bitmask 343 + * size of gro hash buckets, must be <= the number of bits in 344 + * gro_node::bitmask 345 345 */ 346 346 #define GRO_HASH_BUCKETS 8 347 + 348 + /** 349 + * struct gro_node - structure to support Generic Receive Offload 350 + * @bitmask: bitmask to indicate used buckets in @hash 351 + * @hash: hashtable of pending aggregated skbs, separated by flows 352 + * @rx_list: list of pending ``GRO_NORMAL`` skbs 353 + * @rx_count: cached current length of @rx_list 354 + * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone 355 + */ 356 + struct gro_node { 357 + unsigned long bitmask; 358 + struct gro_list hash[GRO_HASH_BUCKETS]; 359 + struct list_head rx_list; 360 + u32 rx_count; 361 + u32 cached_napi_id; 362 + }; 347 363 348 364 /* 349 365 * Structure for per-NAPI config ··· 387 371 unsigned long state; 388 372 int weight; 389 373 u32 defer_hard_irqs_count; 390 - unsigned long gro_bitmask; 391 374 int (*poll)(struct napi_struct *, int); 392 375 #ifdef CONFIG_NETPOLL 393 376 /* CPU actively polling if netpoll is configured */ ··· 395 380 /* CPU on which NAPI has been scheduled for processing */ 396 381 int list_owner; 397 382 struct net_device *dev; 398 - struct gro_list gro_hash[GRO_HASH_BUCKETS]; 399 383 struct sk_buff *skb; 400 - struct list_head rx_list; /* Pending GRO_NORMAL skbs */ 401 - int rx_count; /* length of rx_list */ 402 - unsigned int napi_id; /* protected by netdev_lock */ 384 + struct gro_node gro; 403 385 struct hrtimer timer; 404 386 /* all fields past this point are write-protected by netdev_lock */ 405 387 struct task_struct *thread; ··· 404 392 unsigned long irq_suspend_timeout; 405 393 u32 defer_hard_irqs; 406 394 /* control-path-only fields follow */ 395 + u32 napi_id; 407 396 struct list_head dev_list; 408 397 struct hlist_node napi_hash_node; 409 398 int irq; ··· 4144 4131 int 
netif_receive_skb_core(struct sk_buff *skb); 4145 4132 void netif_receive_skb_list_internal(struct list_head *head); 4146 4133 void netif_receive_skb_list(struct list_head *head); 4147 - gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); 4148 - void napi_gro_flush(struct napi_struct *napi, bool flush_old); 4134 + gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb); 4135 + 4136 + static inline gro_result_t napi_gro_receive(struct napi_struct *napi, 4137 + struct sk_buff *skb) 4138 + { 4139 + return gro_receive_skb(&napi->gro, skb); 4140 + } 4141 + 4149 4142 struct sk_buff *napi_get_frags(struct napi_struct *napi); 4150 4143 gro_result_t napi_gro_frags(struct napi_struct *napi); 4151 4144
+1
include/linux/skbuff.h
··· 1320 1320 void *data, unsigned int frag_size); 1321 1321 void skb_attempt_defer_free(struct sk_buff *skb); 1322 1322 1323 + u32 napi_skb_cache_get_bulk(void **skbs, u32 n); 1323 1324 struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); 1324 1325 struct sk_buff *slab_build_skb(void *data); 1325 1326
+9 -3
include/net/busy_poll.h
··· 127 127 } 128 128 129 129 /* used in the NIC receive handler to mark the skb */ 130 - static inline void skb_mark_napi_id(struct sk_buff *skb, 131 - struct napi_struct *napi) 130 + static inline void __skb_mark_napi_id(struct sk_buff *skb, 131 + const struct gro_node *gro) 132 132 { 133 133 #ifdef CONFIG_NET_RX_BUSY_POLL 134 134 /* If the skb was already marked with a valid NAPI ID, avoid overwriting 135 135 * it. 136 136 */ 137 137 if (!napi_id_valid(skb->napi_id)) 138 - skb->napi_id = napi->napi_id; 138 + skb->napi_id = gro->cached_napi_id; 139 139 #endif 140 + } 141 + 142 + static inline void skb_mark_napi_id(struct sk_buff *skb, 143 + const struct napi_struct *napi) 144 + { 145 + __skb_mark_napi_id(skb, &napi->gro); 140 146 } 141 147 142 148 /* used in the protocol handler to propagate the napi_id to the socket */
+28 -10
include/net/gro.h
··· 509 509 510 510 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); 511 511 int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); 512 + void __gro_flush(struct gro_node *gro, bool flush_old); 513 + 514 + static inline void gro_flush(struct gro_node *gro, bool flush_old) 515 + { 516 + if (!gro->bitmask) 517 + return; 518 + 519 + __gro_flush(gro, flush_old); 520 + } 521 + 522 + static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old) 523 + { 524 + gro_flush(&napi->gro, flush_old); 525 + } 512 526 513 527 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ 514 - static inline void gro_normal_list(struct napi_struct *napi) 528 + static inline void gro_normal_list(struct gro_node *gro) 515 529 { 516 - if (!napi->rx_count) 530 + if (!gro->rx_count) 517 531 return; 518 - netif_receive_skb_list_internal(&napi->rx_list); 519 - INIT_LIST_HEAD(&napi->rx_list); 520 - napi->rx_count = 0; 532 + netif_receive_skb_list_internal(&gro->rx_list); 533 + INIT_LIST_HEAD(&gro->rx_list); 534 + gro->rx_count = 0; 521 535 } 522 536 523 537 /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, 524 538 * pass the whole batch up to the stack. 
525 539 */ 526 - static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) 540 + static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, 541 + int segs) 527 542 { 528 - list_add_tail(&skb->list, &napi->rx_list); 529 - napi->rx_count += segs; 530 - if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) 531 - gro_normal_list(napi); 543 + list_add_tail(&skb->list, &gro->rx_list); 544 + gro->rx_count += segs; 545 + if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) 546 + gro_normal_list(gro); 532 547 } 548 + 549 + void gro_init(struct gro_node *gro); 550 + void gro_cleanup(struct gro_node *gro); 533 551 534 552 /* This function is the alternative of 'inet_iif' and 'inet_sdif' 535 553 * functions in case we can not rely on fields of IPCB.
-1
include/net/xdp.h
··· 343 343 struct net_device *dev); 344 344 struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, 345 345 struct net_device *dev); 346 - int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); 347 346 struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); 348 347 349 348 static inline
+95 -55
kernel/bpf/cpumap.c
··· 33 33 #include <trace/events/xdp.h> 34 34 #include <linux/btf_ids.h> 35 35 36 - #include <linux/netdevice.h> /* netif_receive_skb_list */ 37 - #include <linux/etherdevice.h> /* eth_type_trans */ 36 + #include <linux/netdevice.h> 37 + #include <net/gro.h> 38 38 39 39 /* General idea: XDP packets getting XDP redirected to another CPU, 40 40 * will maximum be stored/queued for one driver ->poll() call. It is ··· 68 68 69 69 struct bpf_cpumap_val value; 70 70 struct bpf_prog *prog; 71 + struct gro_node gro; 71 72 72 73 struct completion kthread_running; 73 74 struct rcu_work free_work; ··· 134 133 } 135 134 } 136 135 137 - static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, 138 - struct list_head *listp, 139 - struct xdp_cpumap_stats *stats) 136 + static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, 137 + void **skbs, u32 skb_n, 138 + struct xdp_cpumap_stats *stats) 140 139 { 141 - struct sk_buff *skb, *tmp; 142 140 struct xdp_buff xdp; 143 - u32 act; 141 + u32 act, pass = 0; 144 142 int err; 145 143 146 - list_for_each_entry_safe(skb, tmp, listp, list) { 144 + for (u32 i = 0; i < skb_n; i++) { 145 + struct sk_buff *skb = skbs[i]; 146 + 147 147 act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); 148 148 switch (act) { 149 149 case XDP_PASS: 150 + skbs[pass++] = skb; 150 151 break; 151 152 case XDP_REDIRECT: 152 - skb_list_del_init(skb); 153 153 err = xdp_do_generic_redirect(skb->dev, skb, &xdp, 154 154 rcpu->prog); 155 155 if (unlikely(err)) { ··· 159 157 } else { 160 158 stats->redirect++; 161 159 } 162 - return; 160 + break; 163 161 default: 164 162 bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); 165 163 fallthrough; ··· 167 165 trace_xdp_exception(skb->dev, rcpu->prog, act); 168 166 fallthrough; 169 167 case XDP_DROP: 170 - skb_list_del_init(skb); 171 - kfree_skb(skb); 168 + napi_consume_skb(skb, true); 172 169 stats->drop++; 173 - return; 170 + break; 174 171 } 175 172 } 173 + 174 + stats->pass += pass; 175 + 176 + return 
pass; 176 177 } 177 178 178 179 static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, ··· 209 204 stats->drop++; 210 205 } else { 211 206 frames[nframes++] = xdpf; 212 - stats->pass++; 213 207 } 214 208 break; 215 209 case XDP_REDIRECT: ··· 232 228 } 233 229 234 230 xdp_clear_return_frame_no_direct(); 231 + stats->pass += nframes; 235 232 236 233 return nframes; 237 234 } 238 235 239 236 #define CPUMAP_BATCH 8 240 237 241 - static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, 242 - int xdp_n, struct xdp_cpumap_stats *stats, 243 - struct list_head *list) 238 + struct cpu_map_ret { 239 + u32 xdp_n; 240 + u32 skb_n; 241 + }; 242 + 243 + static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, 244 + void **skbs, struct cpu_map_ret *ret, 245 + struct xdp_cpumap_stats *stats) 244 246 { 245 247 struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; 246 - int nframes; 247 248 248 249 if (!rcpu->prog) 249 - return xdp_n; 250 + goto out; 250 251 251 - rcu_read_lock_bh(); 252 + rcu_read_lock(); 252 253 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); 253 254 254 - nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); 255 + ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); 256 + if (unlikely(ret->skb_n)) 257 + ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n, 258 + stats); 255 259 256 260 if (stats->redirect) 257 261 xdp_do_flush(); 258 262 259 - if (unlikely(!list_empty(list))) 260 - cpu_map_bpf_prog_run_skb(rcpu, list, stats); 261 - 262 263 bpf_net_ctx_clear(bpf_net_ctx); 263 - rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ 264 + rcu_read_unlock(); 264 265 265 - return nframes; 266 + out: 267 + if (unlikely(ret->skb_n) && ret->xdp_n) 268 + memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs)); 269 + } 270 + 271 + static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) 272 + { 273 + /* 274 + * If the ring is not empty, there'll be a 
new iteration soon, and we 275 + * only need to do a full flush if a tick is long (> 1 ms). 276 + * If the ring is empty, to not hold GRO packets in the stack for too 277 + * long, do a full flush. 278 + * This is equivalent to how NAPI decides whether to perform a full 279 + * flush. 280 + */ 281 + gro_flush(&rcpu->gro, !empty && HZ >= 1000); 282 + gro_normal_list(&rcpu->gro); 266 283 } 267 284 268 285 static int cpu_map_kthread_run(void *data) 269 286 { 270 287 struct bpf_cpu_map_entry *rcpu = data; 271 288 unsigned long last_qs = jiffies; 289 + u32 packets = 0; 272 290 273 291 complete(&rcpu->kthread_running); 274 292 set_current_state(TASK_INTERRUPTIBLE); ··· 303 277 while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { 304 278 struct xdp_cpumap_stats stats = {}; /* zero stats */ 305 279 unsigned int kmem_alloc_drops = 0, sched = 0; 306 - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; 307 - int i, n, m, nframes, xdp_n; 280 + struct cpu_map_ret ret = { }; 308 281 void *frames[CPUMAP_BATCH]; 309 282 void *skbs[CPUMAP_BATCH]; 310 - LIST_HEAD(list); 283 + u32 i, n, m; 284 + bool empty; 311 285 312 286 /* Release CPU reschedule checks */ 313 287 if (__ptr_ring_empty(rcpu->queue)) { ··· 332 306 */ 333 307 n = __ptr_ring_consume_batched(rcpu->queue, frames, 334 308 CPUMAP_BATCH); 335 - for (i = 0, xdp_n = 0; i < n; i++) { 309 + for (i = 0; i < n; i++) { 336 310 void *f = frames[i]; 337 311 struct page *page; 338 312 ··· 340 314 struct sk_buff *skb = f; 341 315 342 316 __ptr_clear_bit(0, &skb); 343 - list_add_tail(&skb->list, &list); 317 + skbs[ret.skb_n++] = skb; 344 318 continue; 345 319 } 346 320 347 - frames[xdp_n++] = f; 321 + frames[ret.xdp_n++] = f; 348 322 page = virt_to_page(f); 349 323 350 324 /* Bring struct page memory area to curr CPU. 
Read by ··· 354 328 prefetchw(page); 355 329 } 356 330 357 - /* Support running another XDP prog on this CPU */ 358 - nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); 359 - if (nframes) { 360 - m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, 361 - gfp, nframes, skbs); 362 - if (unlikely(m == 0)) { 363 - for (i = 0; i < nframes; i++) 364 - skbs[i] = NULL; /* effect: xdp_return_frame */ 365 - kmem_alloc_drops += nframes; 366 - } 367 - } 368 - 369 331 local_bh_disable(); 370 - for (i = 0; i < nframes; i++) { 371 - struct xdp_frame *xdpf = frames[i]; 372 - struct sk_buff *skb = skbs[i]; 373 332 374 - skb = __xdp_build_skb_from_frame(xdpf, skb, 375 - xdpf->dev_rx); 376 - if (!skb) { 377 - xdp_return_frame(xdpf); 378 - continue; 379 - } 333 + /* Support running another XDP prog on this CPU */ 334 + cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats); 335 + if (!ret.xdp_n) 336 + goto stats; 380 337 381 - list_add_tail(&skb->list, &list); 338 + m = napi_skb_cache_get_bulk(skbs, ret.xdp_n); 339 + if (unlikely(m < ret.xdp_n)) { 340 + for (i = m; i < ret.xdp_n; i++) 341 + xdp_return_frame(frames[i]); 342 + 343 + if (ret.skb_n) 344 + memmove(&skbs[m], &skbs[ret.xdp_n], 345 + ret.skb_n * sizeof(*skbs)); 346 + 347 + kmem_alloc_drops += ret.xdp_n - m; 348 + ret.xdp_n = m; 382 349 } 383 350 351 + for (i = 0; i < ret.xdp_n; i++) { 352 + struct xdp_frame *xdpf = frames[i]; 353 + 354 + /* Can fail only when !skb -- already handled above */ 355 + __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx); 356 + } 357 + 358 + stats: 384 359 /* Feedback loop via tracepoint. 385 360 * NB: keep before recv to allow measuring enqueue/dequeue latency. 
386 361 */ 387 362 trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, 388 363 sched, &stats); 389 364 390 - netif_receive_skb_list(&list); 365 + for (i = 0; i < ret.xdp_n + ret.skb_n; i++) 366 + gro_receive_skb(&rcpu->gro, skbs[i]); 367 + 368 + /* Flush either every 64 packets or in case of empty ring */ 369 + packets += n; 370 + empty = __ptr_ring_empty(rcpu->queue); 371 + if (packets >= NAPI_POLL_WEIGHT || empty) { 372 + cpu_map_gro_flush(rcpu, empty); 373 + packets = 0; 374 + } 375 + 391 376 local_bh_enable(); /* resched point, may call do_softirq() */ 392 377 } 393 378 __set_current_state(TASK_RUNNING); ··· 467 430 rcpu->cpu = cpu; 468 431 rcpu->map_id = map->id; 469 432 rcpu->value.qsize = value->qsize; 433 + gro_init(&rcpu->gro); 470 434 471 435 if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) 472 436 goto free_ptr_ring; ··· 496 458 if (rcpu->prog) 497 459 bpf_prog_put(rcpu->prog); 498 460 free_ptr_ring: 461 + gro_cleanup(&rcpu->gro); 499 462 ptr_ring_cleanup(rcpu->queue, NULL); 500 463 free_queue: 501 464 kfree(rcpu->queue); ··· 526 487 527 488 if (rcpu->prog) 528 489 bpf_prog_put(rcpu->prog); 490 + gro_cleanup(&rcpu->gro); 529 491 /* The queue should be empty at this point */ 530 492 __cpu_map_ring_cleanup(rcpu->queue); 531 493 ptr_ring_cleanup(rcpu->queue, NULL);
+22 -57
net/core/dev.c
··· 6484 6484 return false; 6485 6485 6486 6486 if (work_done) { 6487 - if (n->gro_bitmask) 6487 + if (n->gro.bitmask) 6488 6488 timeout = napi_get_gro_flush_timeout(n); 6489 6489 n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); 6490 6490 } ··· 6494 6494 if (timeout) 6495 6495 ret = false; 6496 6496 } 6497 - if (n->gro_bitmask) { 6498 - /* When the NAPI instance uses a timeout and keeps postponing 6499 - * it, we need to bound somehow the time packets are kept in 6500 - * the GRO layer 6501 - */ 6502 - napi_gro_flush(n, !!timeout); 6503 - } 6504 6497 6505 - gro_normal_list(n); 6498 + /* 6499 + * When the NAPI instance uses a timeout and keeps postponing 6500 + * it, we need to bound somehow the time packets are kept in 6501 + * the GRO layer. 6502 + */ 6503 + gro_flush(&n->gro, !!timeout); 6504 + gro_normal_list(&n->gro); 6506 6505 6507 6506 if (unlikely(!list_empty(&n->poll_list))) { 6508 6507 /* If n->poll_list is not empty, we need to mask irqs */ ··· 6565 6566 static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) 6566 6567 { 6567 6568 if (!skip_schedule) { 6568 - gro_normal_list(napi); 6569 + gro_normal_list(&napi->gro); 6569 6570 __napi_schedule(napi); 6570 6571 return; 6571 6572 } 6572 6573 6573 - if (napi->gro_bitmask) { 6574 - /* flush too old packets 6575 - * If HZ < 1000, flush all packets. 6576 - */ 6577 - napi_gro_flush(napi, HZ >= 1000); 6578 - } 6574 + /* Flush too old packets. 
If HZ < 1000, flush all packets */ 6575 + gro_flush(&napi->gro, HZ >= 1000); 6576 + gro_normal_list(&napi->gro); 6579 6577 6580 - gro_normal_list(napi); 6581 6578 clear_bit(NAPI_STATE_SCHED, &napi->state); 6582 6579 } 6583 6580 ··· 6680 6685 } 6681 6686 work = napi_poll(napi, budget); 6682 6687 trace_napi_poll(napi, work, budget); 6683 - gro_normal_list(napi); 6688 + gro_normal_list(&napi->gro); 6684 6689 count: 6685 6690 if (work > 0) 6686 6691 __NET_ADD_STATS(dev_net(napi->dev), ··· 6780 6785 static void __napi_hash_add_with_id(struct napi_struct *napi, 6781 6786 unsigned int napi_id) 6782 6787 { 6788 + napi->gro.cached_napi_id = napi_id; 6789 + 6783 6790 WRITE_ONCE(napi->napi_id, napi_id); 6784 6791 hlist_add_head_rcu(&napi->napi_hash_node, 6785 6792 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); ··· 6848 6851 } 6849 6852 6850 6853 return HRTIMER_NORESTART; 6851 - } 6852 - 6853 - static void init_gro_hash(struct napi_struct *napi) 6854 - { 6855 - int i; 6856 - 6857 - for (i = 0; i < GRO_HASH_BUCKETS; i++) { 6858 - INIT_LIST_HEAD(&napi->gro_hash[i].list); 6859 - napi->gro_hash[i].count = 0; 6860 - } 6861 - napi->gro_bitmask = 0; 6862 6854 } 6863 6855 6864 6856 int dev_set_threaded(struct net_device *dev, bool threaded) ··· 7177 7191 INIT_HLIST_NODE(&napi->napi_hash_node); 7178 7192 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 7179 7193 napi->timer.function = napi_watchdog; 7180 - init_gro_hash(napi); 7194 + gro_init(&napi->gro); 7181 7195 napi->skb = NULL; 7182 - INIT_LIST_HEAD(&napi->rx_list); 7183 - napi->rx_count = 0; 7184 7196 napi->poll = poll; 7185 7197 if (weight > NAPI_POLL_WEIGHT) 7186 7198 netdev_err_once(dev, "%s() called with weight %d\n", __func__, ··· 7292 7308 } 7293 7309 EXPORT_SYMBOL(napi_enable); 7294 7310 7295 - static void flush_gro_hash(struct napi_struct *napi) 7296 - { 7297 - int i; 7298 - 7299 - for (i = 0; i < GRO_HASH_BUCKETS; i++) { 7300 - struct sk_buff *skb, *n; 7301 - 7302 - list_for_each_entry_safe(skb, 
n, &napi->gro_hash[i].list, list) 7303 - kfree_skb(skb); 7304 - napi->gro_hash[i].count = 0; 7305 - } 7306 - } 7307 - 7308 7311 /* Must be called in process context */ 7309 7312 void __netif_napi_del_locked(struct napi_struct *napi) 7310 7313 { ··· 7314 7343 list_del_rcu(&napi->dev_list); 7315 7344 napi_free_frags(napi); 7316 7345 7317 - flush_gro_hash(napi); 7318 - napi->gro_bitmask = 0; 7346 + gro_cleanup(&napi->gro); 7319 7347 7320 7348 if (napi->thread) { 7321 7349 kthread_stop(napi->thread); ··· 7373 7403 return work; 7374 7404 } 7375 7405 7376 - if (n->gro_bitmask) { 7377 - /* flush too old packets 7378 - * If HZ < 1000, flush all packets. 7379 - */ 7380 - napi_gro_flush(n, HZ >= 1000); 7381 - } 7382 - 7383 - gro_normal_list(n); 7406 + /* Flush too old packets. If HZ < 1000, flush all packets */ 7407 + gro_flush(&n->gro, HZ >= 1000); 7408 + gro_normal_list(&n->gro); 7384 7409 7385 7410 /* Some drivers may have called napi_schedule 7386 7411 * prior to exhausting their budget. ··· 12404 12439 static int __net_init netdev_init(struct net *net) 12405 12440 { 12406 12441 BUILD_BUG_ON(GRO_HASH_BUCKETS > 12407 - 8 * sizeof_field(struct napi_struct, gro_bitmask)); 12442 + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); 12408 12443 12409 12444 INIT_LIST_HEAD(&net->dev_base_head); 12410 12445 ··· 12769 12804 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); 12770 12805 spin_lock_init(&sd->defer_lock); 12771 12806 12772 - init_gro_hash(&sd->backlog); 12807 + gro_init(&sd->backlog.gro); 12773 12808 sd->backlog.poll = process_backlog; 12774 12809 sd->backlog.weight = weight_p; 12775 12810 INIT_LIST_HEAD(&sd->backlog.poll_list);
+68 -35
net/core/gro.c
··· 250 250 return 0; 251 251 } 252 252 253 - 254 - static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) 253 + static void gro_complete(struct gro_node *gro, struct sk_buff *skb) 255 254 { 256 255 struct list_head *head = &net_hotdata.offload_base; 257 256 struct packet_offload *ptype; ··· 283 284 } 284 285 285 286 out: 286 - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); 287 + gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count); 287 288 } 288 289 289 - static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, 290 - bool flush_old) 290 + static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old) 291 291 { 292 - struct list_head *head = &napi->gro_hash[index].list; 292 + struct list_head *head = &gro->hash[index].list; 293 293 struct sk_buff *skb, *p; 294 294 295 295 list_for_each_entry_safe_reverse(skb, p, head, list) { 296 296 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 297 297 return; 298 298 skb_list_del_init(skb); 299 - napi_gro_complete(napi, skb); 300 - napi->gro_hash[index].count--; 299 + gro_complete(gro, skb); 300 + gro->hash[index].count--; 301 301 } 302 302 303 - if (!napi->gro_hash[index].count) 304 - __clear_bit(index, &napi->gro_bitmask); 303 + if (!gro->hash[index].count) 304 + __clear_bit(index, &gro->bitmask); 305 305 } 306 306 307 - /* napi->gro_hash[].list contains packets ordered by age. 307 + /* 308 + * gro->hash[].list contains packets ordered by age. 308 309 * youngest packets at the head of it. 309 310 * Complete skbs in reverse order to reduce latencies. 
310 311 */ 311 - void napi_gro_flush(struct napi_struct *napi, bool flush_old) 312 + void __gro_flush(struct gro_node *gro, bool flush_old) 312 313 { 313 - unsigned long bitmask = napi->gro_bitmask; 314 + unsigned long bitmask = gro->bitmask; 314 315 unsigned int i, base = ~0U; 315 316 316 317 while ((i = ffs(bitmask)) != 0) { 317 318 bitmask >>= i; 318 319 base += i; 319 - __napi_gro_flush_chain(napi, base, flush_old); 320 + __gro_flush_chain(gro, base, flush_old); 320 321 } 321 322 } 322 - EXPORT_SYMBOL(napi_gro_flush); 323 + EXPORT_SYMBOL(__gro_flush); 323 324 324 325 static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, 325 326 const struct sk_buff *p, ··· 438 439 gro_pull_from_frag0(skb, grow); 439 440 } 440 441 441 - static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) 442 + static void gro_flush_oldest(struct gro_node *gro, struct list_head *head) 442 443 { 443 444 struct sk_buff *oldest; 444 445 ··· 454 455 * SKB to the chain. 455 456 */ 456 457 skb_list_del_init(oldest); 457 - napi_gro_complete(napi, oldest); 458 + gro_complete(gro, oldest); 458 459 } 459 460 460 - static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 461 + static enum gro_result dev_gro_receive(struct gro_node *gro, 462 + struct sk_buff *skb) 461 463 { 462 464 u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); 463 - struct gro_list *gro_list = &napi->gro_hash[bucket]; 464 465 struct list_head *head = &net_hotdata.offload_base; 466 + struct gro_list *gro_list = &gro->hash[bucket]; 465 467 struct packet_offload *ptype; 466 468 __be16 type = skb->protocol; 467 469 struct sk_buff *pp = NULL; ··· 526 526 527 527 if (pp) { 528 528 skb_list_del_init(pp); 529 - napi_gro_complete(napi, pp); 529 + gro_complete(gro, pp); 530 530 gro_list->count--; 531 531 } 532 532 ··· 537 537 goto normal; 538 538 539 539 if (unlikely(gro_list->count >= MAX_GRO_SKBS)) 540 - gro_flush_oldest(napi, &gro_list->list); 540 + 
gro_flush_oldest(gro, &gro_list->list); 541 541 else 542 542 gro_list->count++; 543 543 ··· 551 551 ret = GRO_HELD; 552 552 ok: 553 553 if (gro_list->count) { 554 - if (!test_bit(bucket, &napi->gro_bitmask)) 555 - __set_bit(bucket, &napi->gro_bitmask); 556 - } else if (test_bit(bucket, &napi->gro_bitmask)) { 557 - __clear_bit(bucket, &napi->gro_bitmask); 554 + if (!test_bit(bucket, &gro->bitmask)) 555 + __set_bit(bucket, &gro->bitmask); 556 + } else if (test_bit(bucket, &gro->bitmask)) { 557 + __clear_bit(bucket, &gro->bitmask); 558 558 } 559 559 560 560 return ret; ··· 593 593 } 594 594 EXPORT_SYMBOL(gro_find_complete_by_type); 595 595 596 - static gro_result_t napi_skb_finish(struct napi_struct *napi, 597 - struct sk_buff *skb, 598 - gro_result_t ret) 596 + static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb, 597 + gro_result_t ret) 599 598 { 600 599 switch (ret) { 601 600 case GRO_NORMAL: 602 - gro_normal_one(napi, skb, 1); 601 + gro_normal_one(gro, skb, 1); 603 602 break; 604 603 605 604 case GRO_MERGED_FREE: ··· 619 620 return ret; 620 621 } 621 622 622 - gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 623 + gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb) 623 624 { 624 625 gro_result_t ret; 625 626 626 - skb_mark_napi_id(skb, napi); 627 + __skb_mark_napi_id(skb, gro); 627 628 trace_napi_gro_receive_entry(skb); 628 629 629 630 skb_gro_reset_offset(skb, 0); 630 631 631 - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); 632 + ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb)); 632 633 trace_napi_gro_receive_exit(ret); 633 634 634 635 return ret; 635 636 } 636 - EXPORT_SYMBOL(napi_gro_receive); 637 + EXPORT_SYMBOL(gro_receive_skb); 637 638 638 639 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 639 640 { ··· 689 690 __skb_push(skb, ETH_HLEN); 690 691 skb->protocol = eth_type_trans(skb, skb->dev); 691 692 if (ret == GRO_NORMAL) 692 - 
gro_normal_one(napi, skb, 1); 693 + gro_normal_one(&napi->gro, skb, 1); 693 694 break; 694 695 695 696 case GRO_MERGED_FREE: ··· 758 759 759 760 trace_napi_gro_frags_entry(skb); 760 761 761 - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 762 + ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb)); 762 763 trace_napi_gro_frags_exit(ret); 763 764 764 765 return ret; ··· 790 791 return sum; 791 792 } 792 793 EXPORT_SYMBOL(__skb_gro_checksum_complete); 794 + 795 + void gro_init(struct gro_node *gro) 796 + { 797 + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { 798 + INIT_LIST_HEAD(&gro->hash[i].list); 799 + gro->hash[i].count = 0; 800 + } 801 + 802 + gro->bitmask = 0; 803 + gro->cached_napi_id = 0; 804 + 805 + INIT_LIST_HEAD(&gro->rx_list); 806 + gro->rx_count = 0; 807 + } 808 + 809 + void gro_cleanup(struct gro_node *gro) 810 + { 811 + struct sk_buff *skb, *n; 812 + 813 + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { 814 + list_for_each_entry_safe(skb, n, &gro->hash[i].list, list) 815 + kfree_skb(skb); 816 + 817 + gro->hash[i].count = 0; 818 + } 819 + 820 + gro->bitmask = 0; 821 + gro->cached_napi_id = 0; 822 + 823 + list_for_each_entry_safe(skb, n, &gro->rx_list, list) 824 + kfree_skb(skb); 825 + 826 + gro->rx_count = 0; 827 + }
+62
net/core/skbuff.c
··· 295 295 return skb; 296 296 } 297 297 298 + /** 299 + * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache 300 + * @skbs: pointer to an at least @n-sized array to fill with skb pointers 301 + * @n: number of entries to provide 302 + * 303 + * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes 304 + * the pointers into the provided array @skbs. If there are less entries 305 + * available, tries to replenish the cache and bulk-allocates the diff from 306 + * the MM layer if needed. 307 + * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are 308 + * ready for {,__}build_skb_around() and don't have any data buffers attached. 309 + * Must be called *only* from the BH context. 310 + * 311 + * Return: number of successfully allocated skbs (@n if no actual allocation 312 + * needed or kmem_cache_alloc_bulk() didn't fail). 313 + */ 314 + u32 napi_skb_cache_get_bulk(void **skbs, u32 n) 315 + { 316 + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); 317 + u32 bulk, total = n; 318 + 319 + local_lock_nested_bh(&napi_alloc_cache.bh_lock); 320 + 321 + if (nc->skb_count >= n) 322 + goto get; 323 + 324 + /* No enough cached skbs. Try refilling the cache first */ 325 + bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK); 326 + nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, 327 + GFP_ATOMIC | __GFP_NOWARN, bulk, 328 + &nc->skb_cache[nc->skb_count]); 329 + if (likely(nc->skb_count >= n)) 330 + goto get; 331 + 332 + /* Still not enough. 
Bulk-allocate the missing part directly, zeroed */ 333 + n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, 334 + GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, 335 + n - nc->skb_count, &skbs[nc->skb_count]); 336 + if (likely(nc->skb_count >= n)) 337 + goto get; 338 + 339 + /* kmem_cache didn't allocate the number we need, limit the output */ 340 + total -= n - nc->skb_count; 341 + n = nc->skb_count; 342 + 343 + get: 344 + for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { 345 + u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); 346 + 347 + skbs[i] = nc->skb_cache[base + i]; 348 + 349 + kasan_mempool_unpoison_object(skbs[i], cache_size); 350 + memset(skbs[i], 0, offsetof(struct sk_buff, tail)); 351 + } 352 + 353 + nc->skb_count -= n; 354 + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); 355 + 356 + return total; 357 + } 358 + EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); 359 + 298 360 static inline void __finalize_skb_around(struct sk_buff *skb, void *data, 299 361 unsigned int size) 300 362 {
-10
net/core/xdp.c
··· 618 618 }; 619 619 EXPORT_SYMBOL_GPL(xdp_warn); 620 620 621 - int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) 622 - { 623 - n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs); 624 - if (unlikely(!n_skb)) 625 - return -ENOMEM; 626 - 627 - return 0; 628 - } 629 - EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); 630 - 631 621 /** 632 622 * xdp_build_skb_from_buff - create an skb from &xdp_buff 633 623 * @xdp: &xdp_buff to convert to an skb