Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net: ps3_gelic_net: handle skb allocation failures

Handle skb allocation failures in the RX path to avoid a NULL pointer
dereference and RX stalls under memory pressure. If the refill fails
with -ENOMEM, complete NAPI polling and retry later via a timer.
Also explicitly re-enable RX DMA after OOM, so the DMAC doesn't remain
stopped in this situation.

Previously, memory pressure could lead to skb allocation failures and
subsequent Oops like:

Oops: Kernel access of bad area, sig: 11 [#2]
Hardware name: SonyPS3 Cell Broadband Engine 0x701000 PS3
NIP [c0003d0000065900] gelic_net_poll+0x6c/0x2d0 [ps3_gelic] (unreliable)
LR [c0003d00000659c4] gelic_net_poll+0x130/0x2d0 [ps3_gelic]
Call Trace:
gelic_net_poll+0x130/0x2d0 [ps3_gelic] (unreliable)
__napi_poll+0x44/0x168
net_rx_action+0x178/0x290

Steps to reproduce the issue:
1. Start continuous network traffic, such as an scp of a 20GB file
2. Inject failslab errors using the kernel fault injection:
echo -1 > /sys/kernel/debug/failslab/times
echo 30 > /sys/kernel/debug/failslab/interval
echo 100 > /sys/kernel/debug/failslab/probability
3. After some time, traces start to appear, the kernel Oopses,
and the system stops

Step 2 is not always necessary, as the issue is usually already triggered
by the transfer of a big enough file.

Fixes: 02c1889166b4 ("ps3: gigabit ethernet driver for PS3, take3")
Signed-off-by: Florian Fuchs <fuchsfl@gmail.com>
Link: https://patch.msgid.link/20251113181000.3914980-1-fuchsfl@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Florian Fuchs and committed by
Paolo Abeni
0f08f0b0 896f1a24

+35 -11
+34 -11
drivers/net/ethernet/toshiba/ps3_gelic_net.c
··· 260 260 if (atomic_dec_if_positive(&card->users) == 0) { 261 261 pr_debug("%s: real do\n", __func__); 262 262 napi_disable(&card->napi); 263 + timer_delete_sync(&card->rx_oom_timer); 263 264 /* 264 265 * Disable irq. Wireless interrupts will 265 266 * be disabled later if any ··· 971 970 * gelic_card_decode_one_descr - processes an rx descriptor 972 971 * @card: card structure 973 972 * 974 - * returns 1 if a packet has been sent to the stack, otherwise 0 973 + * returns 1 if a packet has been sent to the stack, -ENOMEM on skb alloc 974 + * failure, otherwise 0 975 975 * 976 976 * processes an rx descriptor by iommu-unmapping the data buffer and passing 977 977 * the packet up to the stack ··· 983 981 struct gelic_descr_chain *chain = &card->rx_chain; 984 982 struct gelic_descr *descr = chain->head; 985 983 struct net_device *netdev = NULL; 986 - int dmac_chain_ended; 984 + int dmac_chain_ended = 0; 985 + int prepare_rx_ret; 987 986 988 987 status = gelic_descr_get_status(descr); 989 988 990 989 if (status == GELIC_DESCR_DMA_CARDOWNED) 991 990 return 0; 992 991 993 - if (status == GELIC_DESCR_DMA_NOT_IN_USE) { 992 + if (status == GELIC_DESCR_DMA_NOT_IN_USE || !descr->skb) { 994 993 dev_dbg(ctodev(card), "dormant descr? %p\n", descr); 995 - return 0; 994 + dmac_chain_ended = 1; 995 + goto refill; 996 996 } 997 997 998 998 /* netdevice select */ ··· 1052 1048 refill: 1053 1049 1054 1050 /* is the current descriptor terminated with next_descr == NULL? 
*/ 1055 - dmac_chain_ended = 1056 - be32_to_cpu(descr->hw_regs.dmac_cmd_status) & 1057 - GELIC_DESCR_RX_DMA_CHAIN_END; 1051 + if (!dmac_chain_ended) 1052 + dmac_chain_ended = 1053 + be32_to_cpu(descr->hw_regs.dmac_cmd_status) & 1054 + GELIC_DESCR_RX_DMA_CHAIN_END; 1058 1055 /* 1059 1056 * So that always DMAC can see the end 1060 1057 * of the descriptor chain to avoid ··· 1067 1062 gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE); 1068 1063 1069 1064 /* 1070 - * this call can fail, but for now, just leave this 1071 - * descriptor without skb 1065 + * this call can fail, propagate the error 1072 1066 */ 1073 - gelic_descr_prepare_rx(card, descr); 1067 + prepare_rx_ret = gelic_descr_prepare_rx(card, descr); 1068 + if (prepare_rx_ret) 1069 + return prepare_rx_ret; 1074 1070 1075 1071 chain->tail = descr; 1076 1072 chain->head = descr->next; ··· 1093 1087 return 1; 1094 1088 } 1095 1089 1090 + static void gelic_rx_oom_timer(struct timer_list *t) 1091 + { 1092 + struct gelic_card *card = timer_container_of(card, t, rx_oom_timer); 1093 + 1094 + napi_schedule(&card->napi); 1095 + } 1096 + 1096 1097 /** 1097 1098 * gelic_net_poll - NAPI poll function called by the stack to return packets 1098 1099 * @napi: napi structure ··· 1112 1099 { 1113 1100 struct gelic_card *card = container_of(napi, struct gelic_card, napi); 1114 1101 int packets_done = 0; 1102 + int work_result = 0; 1115 1103 1116 1104 while (packets_done < budget) { 1117 - if (!gelic_card_decode_one_descr(card)) 1105 + work_result = gelic_card_decode_one_descr(card); 1106 + if (work_result != 1) 1118 1107 break; 1119 1108 1120 1109 packets_done++; 1110 + } 1111 + 1112 + if (work_result == -ENOMEM) { 1113 + napi_complete_done(napi, packets_done); 1114 + mod_timer(&card->rx_oom_timer, jiffies + 1); 1115 + return packets_done; 1121 1116 } 1122 1117 1123 1118 if (packets_done < budget) { ··· 1596 1575 atomic_set(&card->tx_timeout_task_counter, 0); 1597 1576 mutex_init(&card->updown_lock); 1598 1577 
atomic_set(&card->users, 0); 1578 + 1579 + timer_setup(&card->rx_oom_timer, gelic_rx_oom_timer, 0); 1599 1580 1600 1581 return card; 1601 1582 }
+1
drivers/net/ethernet/toshiba/ps3_gelic_net.h
··· 268 268 struct gelic_card { 269 269 struct napi_struct napi; 270 270 struct net_device *netdev[GELIC_PORT_MAX]; 271 + struct timer_list rx_oom_timer; 271 272 /* 272 273 * hypervisor requires irq_status should be 273 274 * 8 bytes aligned, but u64 member is