Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net: track pfmemalloc drops via SKB_DROP_REASON_PFMEMALLOC

Add a new SKB drop reason (SKB_DROP_REASON_PFMEMALLOC) to track packets
dropped due to memory pressure. In production environments, we've observed
memory exhaustion reported in memory-layer stack traces, but the resulting
packet drops were not tracked by the SKB drop reason infrastructure.

While most network code paths now properly report pfmemalloc drops, some
protocol-specific socket implementations still use sk_filter() without
drop reason tracking:
- Bluetooth L2CAP sockets
- CAIF sockets
- IUCV sockets
- Netlink sockets
- SCTP sockets
- Unix domain sockets

These remaining cases represent less common paths and could be converted
in a follow-up patch if needed. The current implementation provides
significantly improved observability into memory pressure events in the
network stack, especially for key protocols like TCP and UDP, helping to
diagnose problems in production environments.

Reported-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://patch.msgid.link/175268316579.2407873.11634752355644843509.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Jesper Dangaard Brouer and committed by Jakub Kicinski
a6f19063 8b7ab8eb

+75 -44
+2 -4
drivers/net/tun.c
··· 1002 1002 /* Net device start xmit */ 1003 1003 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) 1004 1004 { 1005 + enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 1005 1006 struct tun_struct *tun = netdev_priv(dev); 1006 - enum skb_drop_reason drop_reason; 1007 1007 int txq = skb->queue_mapping; 1008 1008 struct netdev_queue *queue; 1009 1009 struct tun_file *tfile; ··· 1032 1032 } 1033 1033 1034 1034 if (tfile->socket.sk->sk_filter && 1035 - sk_filter(tfile->socket.sk, skb)) { 1036 - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 1035 + sk_filter_reason(tfile->socket.sk, skb, &drop_reason)) 1037 1036 goto drop; 1038 - } 1039 1037 1040 1038 len = run_ebpf_filter(tun, skb, len); 1041 1039 if (len == 0) {
+12 -2
include/linux/filter.h
··· 1073 1073 return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); 1074 1074 } 1075 1075 1076 - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); 1076 + int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap, 1077 + enum skb_drop_reason *reason); 1078 + 1077 1079 static inline int sk_filter(struct sock *sk, struct sk_buff *skb) 1078 1080 { 1079 - return sk_filter_trim_cap(sk, skb, 1); 1081 + enum skb_drop_reason ignore_reason; 1082 + 1083 + return sk_filter_trim_cap(sk, skb, 1, &ignore_reason); 1084 + } 1085 + 1086 + static inline int sk_filter_reason(struct sock *sk, struct sk_buff *skb, 1087 + enum skb_drop_reason *reason) 1088 + { 1089 + return sk_filter_trim_cap(sk, skb, 1, reason); 1080 1090 } 1081 1091 1082 1092 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
+6
include/net/dropreason-core.h
··· 125 125 FN(CAN_RX_INVALID_FRAME) \ 126 126 FN(CANFD_RX_INVALID_FRAME) \ 127 127 FN(CANXL_RX_INVALID_FRAME) \ 128 + FN(PFMEMALLOC) \ 128 129 FNe(MAX) 129 130 130 131 /** ··· 599 598 * non conform CAN-XL frame (or device is unable to receive CAN frames) 600 599 */ 601 600 SKB_DROP_REASON_CANXL_RX_INVALID_FRAME, 601 + /** 602 + * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve 603 + * reached a path or socket not eligible for use of memory reserves 604 + */ 605 + SKB_DROP_REASON_PFMEMALLOC, 602 606 /** 603 607 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which 604 608 * shouldn't be used as a real 'reason' - only for tracing code gen
+1 -1
include/net/tcp.h
··· 1559 1559 enum skb_drop_reason *reason); 1560 1560 1561 1561 1562 - int tcp_filter(struct sock *sk, struct sk_buff *skb); 1562 + int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); 1563 1563 void tcp_set_state(struct sock *sk, int state); 1564 1564 void tcp_done(struct sock *sk); 1565 1565 int tcp_abort(struct sock *sk, int err);
+6 -2
net/core/dev.c
··· 5749 5749 static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, 5750 5750 struct packet_type **ppt_prev) 5751 5751 { 5752 + enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; 5752 5753 struct packet_type *ptype, *pt_prev; 5753 5754 rx_handler_func_t *rx_handler; 5754 5755 struct sk_buff *skb = *pskb; ··· 5841 5840 #endif 5842 5841 skb_reset_redirect(skb); 5843 5842 skip_classify: 5844 - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 5843 + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { 5844 + drop_reason = SKB_DROP_REASON_PFMEMALLOC; 5845 5845 goto drop; 5846 + } 5846 5847 5847 5848 if (skb_vlan_tag_present(skb)) { 5848 5849 if (pt_prev) { ··· 5949 5946 dev_core_stats_rx_dropped_inc(skb->dev); 5950 5947 else 5951 5948 dev_core_stats_rx_nohandler_inc(skb->dev); 5952 - kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); 5949 + 5950 + kfree_skb_reason(skb, drop_reason); 5953 5951 /* Jamal, now you will not able to escape explaining 5954 5952 * me how you were going to use this. :-) 5955 5953 */
+12 -3
net/core/filter.c
··· 122 122 * @sk: sock associated with &sk_buff 123 123 * @skb: buffer to filter 124 124 * @cap: limit on how short the eBPF program may trim the packet 125 + * @reason: record drop reason on errors (negative return value) 125 126 * 126 127 * Run the eBPF program and then cut skb->data to correct size returned by 127 128 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller ··· 131 130 * be accepted or -EPERM if the packet should be tossed. 132 131 * 133 132 */ 134 - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) 133 + int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, 134 + unsigned int cap, enum skb_drop_reason *reason) 135 135 { 136 136 int err; 137 137 struct sk_filter *filter; ··· 144 142 */ 145 143 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { 146 144 NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 145 + *reason = SKB_DROP_REASON_PFMEMALLOC; 147 146 return -ENOMEM; 148 147 } 149 148 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); 150 - if (err) 149 + if (err) { 150 + *reason = SKB_DROP_REASON_SOCKET_FILTER; 151 151 return err; 152 + } 152 153 153 154 err = security_sock_rcv_skb(sk, skb); 154 - if (err) 155 + if (err) { 156 + *reason = SKB_DROP_REASON_SECURITY_HOOK; 155 157 return err; 158 + } 156 159 157 160 rcu_read_lock(); 158 161 filter = rcu_dereference(sk->sk_filter); ··· 169 162 pkt_len = bpf_prog_run_save_cb(filter->prog, skb); 170 163 skb->sk = save_sk; 171 164 err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; 165 + if (err) 166 + *reason = SKB_DROP_REASON_SOCKET_FILTER; 172 167 } 173 168 rcu_read_unlock(); 174 169
+13 -7
net/core/sock.c
··· 526 526 enum skb_drop_reason drop_reason; 527 527 int err; 528 528 529 - err = sk_filter(sk, skb); 530 - if (err) { 531 - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 529 + err = sk_filter_reason(sk, skb, &drop_reason); 530 + if (err) 532 531 goto out; 533 - } 532 + 534 533 err = __sock_queue_rcv_skb(sk, skb); 535 534 switch (err) { 536 535 case -ENOMEM: ··· 552 553 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 553 554 const int nested, unsigned int trim_cap, bool refcounted) 554 555 { 556 + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; 555 557 int rc = NET_RX_SUCCESS; 558 + int err; 556 559 557 - if (sk_filter_trim_cap(sk, skb, trim_cap)) 560 + if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) 558 561 goto discard_and_relse; 559 562 560 563 skb->dev = NULL; 561 564 562 565 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { 563 566 atomic_inc(&sk->sk_drops); 567 + reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 564 568 goto discard_and_relse; 565 569 } 566 570 if (nested) ··· 579 577 rc = sk_backlog_rcv(sk, skb); 580 578 581 579 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 582 - } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 580 + } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { 583 581 bh_unlock_sock(sk); 582 + if (err == -ENOMEM) 583 + reason = SKB_DROP_REASON_PFMEMALLOC; 584 + if (err == -ENOBUFS) 585 + reason = SKB_DROP_REASON_SOCKET_BACKLOG; 584 586 atomic_inc(&sk->sk_drops); 585 587 goto discard_and_relse; 586 588 } ··· 595 589 sock_put(sk); 596 590 return rc; 597 591 discard_and_relse: 598 - kfree_skb(skb); 592 + sk_skb_reason_drop(sk, skb, reason); 599 593 goto out; 600 594 } 601 595 EXPORT_SYMBOL(__sk_receive_skb);
+15 -11
net/ipv4/tcp_ipv4.c
··· 2026 2026 u32 gso_size; 2027 2027 u64 limit; 2028 2028 int delta; 2029 + int err; 2029 2030 2030 2031 /* In case all data was pulled from skb frags (in __pskb_pull_tail()), 2031 2032 * we can fix skb->truesize to its real value to avoid future drops. ··· 2137 2136 2138 2137 limit = min_t(u64, limit, UINT_MAX); 2139 2138 2140 - if (unlikely(sk_add_backlog(sk, skb, limit))) { 2139 + err = sk_add_backlog(sk, skb, limit); 2140 + if (unlikely(err)) { 2141 2141 bh_unlock_sock(sk); 2142 - *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2143 - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2142 + if (err == -ENOMEM) { 2143 + *reason = SKB_DROP_REASON_PFMEMALLOC; 2144 + __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 2145 + } else { 2146 + *reason = SKB_DROP_REASON_SOCKET_BACKLOG; 2147 + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); 2148 + } 2144 2149 return true; 2145 2150 } 2146 2151 return false; 2147 2152 } 2148 2153 EXPORT_IPV6_MOD(tcp_add_backlog); 2149 2154 2150 - int tcp_filter(struct sock *sk, struct sk_buff *skb) 2155 + int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) 2151 2156 { 2152 2157 struct tcphdr *th = (struct tcphdr *)skb->data; 2153 2158 2154 - return sk_filter_trim_cap(sk, skb, th->doff * 4); 2159 + return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); 2155 2160 } 2156 2161 EXPORT_IPV6_MOD(tcp_filter); 2157 2162 ··· 2284 2277 } 2285 2278 refcounted = true; 2286 2279 nsk = NULL; 2287 - if (!tcp_filter(sk, skb)) { 2280 + if (!tcp_filter(sk, skb, &drop_reason)) { 2288 2281 th = (const struct tcphdr *)skb->data; 2289 2282 iph = ip_hdr(skb); 2290 2283 tcp_v4_fill_cb(skb, iph, th); 2291 2284 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 2292 2285 &drop_reason); 2293 - } else { 2294 - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2295 2286 } 2296 2287 if (!nsk) { 2297 2288 reqsk_put(req); ··· 2345 2340 2346 2341 nf_reset_ct(skb); 2347 2342 2348 - if (tcp_filter(sk, skb)) { 2349 - 
drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2343 + if (tcp_filter(sk, skb, &drop_reason)) 2350 2344 goto discard_and_relse; 2351 - } 2345 + 2352 2346 th = (const struct tcphdr *)skb->data; 2353 2347 iph = ip_hdr(skb); 2354 2348 tcp_v4_fill_cb(skb, iph, th);
+2 -4
net/ipv4/udp.c
··· 2347 2347 */ 2348 2348 static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) 2349 2349 { 2350 - int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2350 + enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; 2351 2351 struct udp_sock *up = udp_sk(sk); 2352 2352 int is_udplite = IS_UDPLITE(sk); 2353 2353 ··· 2436 2436 udp_lib_checksum_complete(skb)) 2437 2437 goto csum_error; 2438 2438 2439 - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { 2440 - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 2439 + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) 2441 2440 goto drop; 2442 - } 2443 2441 2444 2442 udp_csum_pull_header(skb); 2445 2443
+3 -6
net/ipv6/tcp_ipv6.c
··· 1834 1834 } 1835 1835 refcounted = true; 1836 1836 nsk = NULL; 1837 - if (!tcp_filter(sk, skb)) { 1837 + if (!tcp_filter(sk, skb, &drop_reason)) { 1838 1838 th = (const struct tcphdr *)skb->data; 1839 1839 hdr = ipv6_hdr(skb); 1840 1840 tcp_v6_fill_cb(skb, hdr, th); 1841 1841 nsk = tcp_check_req(sk, skb, req, false, &req_stolen, 1842 1842 &drop_reason); 1843 - } else { 1844 - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 1845 1843 } 1846 1844 if (!nsk) { 1847 1845 reqsk_put(req); ··· 1895 1897 1896 1898 nf_reset_ct(skb); 1897 1899 1898 - if (tcp_filter(sk, skb)) { 1899 - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 1900 + if (tcp_filter(sk, skb, &drop_reason)) 1900 1901 goto discard_and_relse; 1901 - } 1902 + 1902 1903 th = (const struct tcphdr *)skb->data; 1903 1904 hdr = ipv6_hdr(skb); 1904 1905 tcp_v6_fill_cb(skb, hdr, th);
+1 -3
net/ipv6/udp.c
··· 894 894 udp_lib_checksum_complete(skb)) 895 895 goto csum_error; 896 896 897 - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { 898 - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 897 + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) 899 898 goto drop; 900 - } 901 899 902 900 udp_csum_pull_header(skb); 903 901
+2 -1
net/rose/rose_in.c
··· 101 101 */ 102 102 static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) 103 103 { 104 + enum skb_drop_reason dr; /* ignored */ 104 105 struct rose_sock *rose = rose_sk(sk); 105 106 int queued = 0; 106 107 ··· 163 162 rose_frames_acked(sk, nr); 164 163 if (ns == rose->vr) { 165 164 rose_start_idletimer(sk); 166 - if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 && 165 + if (!sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN, &dr) && 167 166 __sock_queue_rcv_skb(sk, skb) == 0) { 168 167 rose->vr = (rose->vr + 1) % ROSE_MODULUS; 169 168 queued = 1;