Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'amd-xgbe-tx-resilience-improvements-for-link-down-handling'

Raju Rangoju says:

====================
amd-xgbe: TX resilience improvements for link-down handling

This series enhances the AMD 10GbE driver's TX queue handling during
link-down events to improve resilience, prevent resource leaks, and
enable fast failover in link aggregation configurations.

The three patches form a complete link-down handling solution:

1. Patch 1: Fast detection (know quickly when link goes down)
2. Patch 2: Quick response (stop TX immediately, skip waits)
3. Patch 3: Clean recovery (reclaim abandoned resources)
====================

Link: https://patch.msgid.link/20260319163251.1808611-1-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+159 -18
+4
drivers/net/ethernet/amd/xgbe/xgbe-common.h
··· 330 330 #define MAC_ISR_SMI_WIDTH 1 331 331 #define MAC_ISR_TSIS_INDEX 12 332 332 #define MAC_ISR_TSIS_WIDTH 1 333 + #define MAC_ISR_LS_INDEX 24 334 + #define MAC_ISR_LS_WIDTH 2 335 + #define MAC_ISR_LSI_INDEX 0 336 + #define MAC_ISR_LSI_WIDTH 1 333 337 #define MAC_MACA1HR_AE_INDEX 31 334 338 #define MAC_MACA1HR_AE_WIDTH 1 335 339 #define MAC_MDIOIER_SNGLCOMPIE_INDEX 12
+76 -12
drivers/net/ethernet/amd/xgbe/xgbe-dev.c
··· 3186 3186 /* The Tx engine cannot be stopped if it is actively processing 3187 3187 * packets. Wait for the Tx queue to empty the Tx fifo. Don't 3188 3188 * wait forever though... 3189 + * 3190 + * Optimization: Skip the wait when link is down. Hardware won't 3191 + * complete TX processing, so waiting serves no purpose and only 3192 + * delays interface shutdown. Descriptors will be reclaimed via 3193 + * the force-cleanup path in tx_poll. 3189 3194 */ 3195 + 3196 + if (!pdata->phy.link) 3197 + return; 3198 + 3190 3199 tx_timeout = jiffies + (XGBE_DMA_STOP_TIMEOUT * HZ); 3191 3200 while (time_before(jiffies, tx_timeout)) { 3192 3201 tx_status = XGMAC_MTL_IOREAD(pdata, queue, MTL_Q_TQDR); ··· 3276 3267 XGMAC_IOWRITE_BITS(pdata, MAC_TCR, TE, 1); 3277 3268 } 3278 3269 3270 + /** 3271 + * xgbe_wait_for_dma_tx_complete - Wait for DMA to complete pending TX 3272 + * @pdata: driver private data 3273 + * 3274 + * Wait for the DMA TX channels to complete all pending descriptors. 3275 + * This ensures no frames are in-flight before we disable the transmitter. 3276 + * If link is down, return immediately as TX will never complete. 3277 + * 3278 + * Return: 0 on success, -ETIMEDOUT on timeout 3279 + */ 3280 + static int xgbe_wait_for_dma_tx_complete(struct xgbe_prv_data *pdata) 3281 + { 3282 + struct xgbe_channel *channel; 3283 + struct xgbe_ring *ring; 3284 + unsigned long timeout; 3285 + unsigned int i; 3286 + bool complete; 3287 + 3288 + /* If link is down, TX will never complete - skip waiting */ 3289 + if (!pdata->phy.link) 3290 + return 0; 3291 + 3292 + timeout = jiffies + (XGBE_DMA_STOP_TIMEOUT * HZ); 3293 + 3294 + do { 3295 + complete = true; 3296 + 3297 + for (i = 0; i < pdata->channel_count; i++) { 3298 + channel = pdata->channel[i]; 3299 + ring = channel->tx_ring; 3300 + if (!ring) 3301 + continue; 3302 + 3303 + /* Check if DMA has processed all descriptors */ 3304 + if (ring->dirty != ring->cur) { 3305 + complete = false; 3306 + break; 3307 + } 3308 + } 3309 + 3310 + if (complete) 3311 + return 0; 3312 + 3313 + usleep_range(100, 200); 3314 + } while (time_before(jiffies, timeout)); 3315 + 3316 + netif_warn(pdata, drv, pdata->netdev, 3317 + "timeout waiting for DMA TX to complete\n"); 3318 + return -ETIMEDOUT; 3319 + } 3320 + 3279 3321 static void xgbe_disable_tx(struct xgbe_prv_data *pdata) 3280 3322 { 3281 3323 unsigned int i; 3282 3324 3283 - /* Prepare for Tx DMA channel stop */ 3284 - for (i = 0; i < pdata->tx_q_count; i++) 3285 - xgbe_prepare_tx_stop(pdata, i); 3325 + /* Step 1: Wait for DMA to complete pending descriptors */ 3326 + xgbe_wait_for_dma_tx_complete(pdata); 3286 3327 3287 - /* Disable MAC Tx */ 3288 - XGMAC_IOWRITE_BITS(pdata, MAC_TCR, TE, 0); 3289 - 3290 - /* Disable each Tx queue */ 3291 - for (i = 0; i < pdata->tx_q_count; i++) 3292 - XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_TQOMR, TXQEN, 0); 3293 - 3294 - /* Disable each Tx DMA channel */ 3328 + /* Step 2: Disable each Tx DMA channel to stop 3329 + * processing new descriptors 3330 + */ 3295 3331 for (i = 0; i < pdata->channel_count; i++) { 3296 3332 if (!pdata->channel[i]->tx_ring) 3297 3333 break; 3298 - 3299 3334 XGMAC_DMA_IOWRITE_BITS(pdata->channel[i], DMA_CH_TCR, ST, 0); 3300 3335 } 3336 + 3337 + /* Step 3: Wait for MTL TX queues to drain */ 3338 + for (i = 0; i < pdata->tx_q_count; i++) 3339 + xgbe_prepare_tx_stop(pdata, i); 3340 + 3341 + /* Step 4: Disable MTL TX queues */ 3342 + for (i = 0; i < pdata->tx_q_count; i++) 3343 + XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_TQOMR, TXQEN, 0); 3344 + 3345 + /* Step 5: Disable MAC TX last */ 3346 + XGMAC_IOWRITE_BITS(pdata, MAC_TCR, TE, 0); 3301 3347 } 3302 3348 3303 3349 static void xgbe_prepare_rx_stop(struct xgbe_prv_data *pdata,
+61 -6
drivers/net/ethernet/amd/xgbe/xgbe-drv.c
··· 607 607 struct xgbe_prv_data *pdata = timer_container_of(pdata, t, 608 608 service_timer); 609 609 struct xgbe_channel *channel; 610 + unsigned int poll_interval; 610 611 unsigned int i; 611 612 612 613 queue_work(pdata->dev_workqueue, &pdata->service_work); 613 614 614 - mod_timer(&pdata->service_timer, jiffies + HZ); 615 + /* Adaptive link status polling for fast failure detection: 616 + * 617 + * - When carrier is UP: poll every 100ms for rapid link-down detection 618 + * Enables sub-second response to link failures, minimizing traffic 619 + * loss. 620 + * 621 + * - When carrier is DOWN: poll every 1s to conserve CPU resources 622 + * Link-up events are less time-critical. 623 + * 624 + * The 100ms active polling interval balances responsiveness with 625 + * efficiency: 626 + * - Provides ~100-200ms link-down detection (10x faster than 1s 627 + * polling) 628 + * - Minimal CPU overhead (1% vs 0.1% with 1s polling) 629 + * - Enables fast failover in link aggregation deployments 630 + */ 631 + if (netif_running(pdata->netdev) && netif_carrier_ok(pdata->netdev)) 632 + poll_interval = msecs_to_jiffies(100); /* 100ms when up */ 633 + else 634 + poll_interval = HZ; /* 1 second when down */ 635 + 636 + mod_timer(&pdata->service_timer, jiffies + poll_interval); 615 637 616 638 if (!pdata->tx_usecs) 617 639 return; ··· 2169 2147 struct net_device *netdev = pdata->netdev; 2170 2148 struct netdev_queue *txq; 2171 2149 int processed = 0; 2150 + int force_cleanup; 2172 2151 unsigned int tx_packets = 0, tx_bytes = 0; 2173 2152 unsigned int cur; 2174 2153 ··· 2186 2163 2187 2164 txq = netdev_get_tx_queue(netdev, channel->queue_index); 2188 2165 2166 + /* Smart descriptor cleanup during link-down conditions. 2167 + * 2168 + * When link is down, hardware stops processing TX descriptors (OWN bit 2169 + * remains set). Enable intelligent cleanup to reclaim these abandoned 2170 + * descriptors and maintain TX queue health. 2171 + * 2172 + * This cleanup mechanism enables: 2173 + * - Continuous TX queue availability for new packets when link recovers 2174 + * - Clean resource management (skbs, DMA mappings, descriptors) 2175 + * - Fast failover in link aggregation scenarios 2176 + */ 2177 + force_cleanup = !pdata->phy.link; 2178 + 2189 2179 while ((processed < XGBE_TX_DESC_MAX_PROC) && 2190 2180 (ring->dirty != cur)) { 2191 2181 rdata = XGBE_GET_DESC_DATA(ring, ring->dirty); 2192 2182 rdesc = rdata->rdesc; 2193 2183 2194 - if (!hw_if->tx_complete(rdesc)) 2195 - break; 2184 + if (!hw_if->tx_complete(rdesc)) { 2185 + if (!force_cleanup) 2186 + break; 2187 + /* Link-down descriptor cleanup: reclaim abandoned 2188 + * resources. 2189 + * 2190 + * Hardware has stopped processing this descriptor, so 2191 + * perform intelligent cleanup to free skbs and reclaim 2192 + * descriptors for future use when link recovers. 2193 + * 2194 + * These are not counted as successful transmissions 2195 + * since packets never reached the wire. 2196 + */ 2197 + netif_dbg(pdata, tx_err, netdev, 2198 + "force-freeing stuck TX desc %u (link down)\n", 2199 + ring->dirty); 2200 + } 2196 2201 2197 2202 /* Make sure descriptor fields are read after reading the OWN 2198 2203 * bit */ ··· 2229 2178 if (netif_msg_tx_done(pdata)) 2230 2179 xgbe_dump_tx_desc(pdata, ring, ring->dirty, 1, 0); 2231 2180 2232 - if (hw_if->is_last_desc(rdesc)) { 2233 - tx_packets += rdata->tx.packets; 2234 - tx_bytes += rdata->tx.bytes; 2181 + /* Only count packets actually transmitted (not force-cleaned) 2182 + */ 2183 + if (!force_cleanup || hw_if->is_last_desc(rdesc)) { 2184 + if (hw_if->is_last_desc(rdesc)) { 2185 + tx_packets += rdata->tx.packets; 2186 + tx_bytes += rdata->tx.bytes; 2187 + } 2235 2188 } 2236 2189 2237 2190 /* Free the SKB and reset the descriptor for re-use */
+18
drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
··· 1047 1047 if (pdata->phy_link != pdata->phy.link) { 1048 1048 new_state = 1; 1049 1049 pdata->phy_link = pdata->phy.link; 1050 + 1051 + /* Link is coming up - wake TX queues */ 1052 + netif_tx_wake_all_queues(pdata->netdev); 1050 1053 } 1051 1054 } else if (pdata->phy_link) { 1052 1055 new_state = 1; 1053 1056 pdata->phy_link = 0; 1054 1057 pdata->phy_speed = SPEED_UNKNOWN; 1058 + 1059 + /* Proactive TX queue management on link-down. 1060 + * 1061 + * Immediately stop TX queues to enable clean link-down 1062 + * handling: 1063 + * - Prevents queueing packets that can't be transmitted 1064 + * - Allows orderly descriptor cleanup by NAPI poll 1065 + * - Enables rapid failover in link aggregation configurations 1066 + * 1067 + * Note: We do NOT call netdev_tx_reset_queue() here because 1068 + * NAPI poll may still be running and would trigger BQL 1069 + * assertion. BQL state is cleaned up naturally during 1070 + * descriptor reclamation. 1071 + */ 1072 + netif_tx_stop_all_queues(pdata->netdev); 1055 1073 } 1056 1074 1057 1075 if (new_state && netif_msg_link(pdata))