Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

ibmveth: Add multi buffers rx replenishment hcall support

This patch enables batched RX buffer replenishment in ibmveth by
using the new firmware-supported h_add_logical_lan_buffers() hcall
to submit up to 8 RX buffers in a single call, instead of repeatedly
calling the single-buffer h_add_logical_lan_buffer() hcall.

During probe, with this patch, the driver queries ILLAN attributes
to detect the IBMVETH_ILLAN_RX_MULTI_BUFF_SUPPORT bit. If the attribute is
present, rx_buffers_per_hcall is set to 8, enabling batched replenishment.
Otherwise, it defaults to 1, preserving the original upstream behavior
with no change in code flow for unsupported systems.

The core rx replenish logic remains the same. But when batching
is enabled, the driver aggregates up to 8 fully prepared descriptors
into a single h_add_logical_lan_buffers() hypercall. If any allocation
or DMA mapping fails while preparing a batch, only the successfully
prepared buffers are submitted, and the remaining are deferred for
the next replenish cycle.

If at runtime the firmware stops accepting the batched hcall—e.g.,
after a Live Partition Migration (LPM) to a host that does not
support h_add_logical_lan_buffers()—the hypercall returns H_FUNCTION.
In that case, the driver transparently disables batching, resets
rx_buffers_per_hcall to 1, and falls back to the single-buffer hcall
in subsequent replenishments to take care of these and future buffers.

Tests were done on systems with firmware that both supports and
does not support the new h_add_logical_lan_buffers hcall.

On supported firmware, this reduces hypercall overhead significantly
over multiple buffers. SAR measurements showed about a 15% improvement
in packet processing rate under moderate RX load, with heavier traffic
seeing gains of more than 30%.

Signed-off-by: Mingming Cao <mmc@linux.ibm.com>
Reviewed-by: Brian King <bjking1@linux.ibm.com>
Reviewed-by: Haren Myneni <haren@linux.ibm.com>
Reviewed-by: Dave Marquardt <davemarq@linux.ibm.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250719091356.57252-1-mmc@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Mingming Cao and committed by
Paolo Abeni
2094200b db8a5149

+182 -76
+1
arch/powerpc/include/asm/hvcall.h
··· 270 270 #define H_QUERY_INT_STATE 0x1E4 271 271 #define H_POLL_PENDING 0x1D8 272 272 #define H_ILLAN_ATTRIBUTES 0x244 273 + #define H_ADD_LOGICAL_LAN_BUFFERS 0x248 273 274 #define H_MODIFY_HEA_QP 0x250 274 275 #define H_QUERY_HEA_QP 0x254 275 276 #define H_QUERY_HEA 0x258
+160 -76
drivers/net/ethernet/ibm/ibmveth.c
··· 211 211 static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, 212 212 struct ibmveth_buff_pool *pool) 213 213 { 214 - u32 i; 215 - u32 count = pool->size - atomic_read(&pool->available); 216 - u32 buffers_added = 0; 217 - struct sk_buff *skb; 218 - unsigned int free_index, index; 219 - u64 correlator; 214 + union ibmveth_buf_desc descs[IBMVETH_MAX_RX_PER_HCALL] = {0}; 215 + u32 remaining = pool->size - atomic_read(&pool->available); 216 + u64 correlators[IBMVETH_MAX_RX_PER_HCALL] = {0}; 220 217 unsigned long lpar_rc; 218 + u32 buffers_added = 0; 219 + u32 i, filled, batch; 220 + struct vio_dev *vdev; 221 221 dma_addr_t dma_addr; 222 + struct device *dev; 223 + u32 index; 224 + 225 + vdev = adapter->vdev; 226 + dev = &vdev->dev; 222 227 223 228 mb(); 224 229 225 - for (i = 0; i < count; ++i) { 226 - union ibmveth_buf_desc desc; 230 + batch = adapter->rx_buffers_per_hcall; 227 231 228 - free_index = pool->consumer_index; 229 - index = pool->free_map[free_index]; 230 - skb = NULL; 232 + while (remaining > 0) { 233 + unsigned int free_index = pool->consumer_index; 231 234 232 - if (WARN_ON(index == IBM_VETH_INVALID_MAP)) { 233 - schedule_work(&adapter->work); 234 - goto bad_index_failure; 235 + /* Fill a batch of descriptors */ 236 + for (filled = 0; filled < min(remaining, batch); filled++) { 237 + index = pool->free_map[free_index]; 238 + if (WARN_ON(index == IBM_VETH_INVALID_MAP)) { 239 + adapter->replenish_add_buff_failure++; 240 + netdev_info(adapter->netdev, 241 + "Invalid map index %u, reset\n", 242 + index); 243 + schedule_work(&adapter->work); 244 + break; 245 + } 246 + 247 + if (!pool->skbuff[index]) { 248 + struct sk_buff *skb = NULL; 249 + 250 + skb = netdev_alloc_skb(adapter->netdev, 251 + pool->buff_size); 252 + if (!skb) { 253 + adapter->replenish_no_mem++; 254 + adapter->replenish_add_buff_failure++; 255 + break; 256 + } 257 + 258 + dma_addr = dma_map_single(dev, skb->data, 259 + pool->buff_size, 260 + DMA_FROM_DEVICE); 261 + if 
(dma_mapping_error(dev, dma_addr)) { 262 + dev_kfree_skb_any(skb); 263 + adapter->replenish_add_buff_failure++; 264 + break; 265 + } 266 + 267 + pool->dma_addr[index] = dma_addr; 268 + pool->skbuff[index] = skb; 269 + } else { 270 + /* re-use case */ 271 + dma_addr = pool->dma_addr[index]; 272 + } 273 + 274 + if (rx_flush) { 275 + unsigned int len; 276 + 277 + len = adapter->netdev->mtu + IBMVETH_BUFF_OH; 278 + len = min(pool->buff_size, len); 279 + ibmveth_flush_buffer(pool->skbuff[index]->data, 280 + len); 281 + } 282 + 283 + descs[filled].fields.flags_len = IBMVETH_BUF_VALID | 284 + pool->buff_size; 285 + descs[filled].fields.address = dma_addr; 286 + 287 + correlators[filled] = ((u64)pool->index << 32) | index; 288 + *(u64 *)pool->skbuff[index]->data = correlators[filled]; 289 + 290 + free_index++; 291 + if (free_index >= pool->size) 292 + free_index = 0; 235 293 } 236 294 237 - /* are we allocating a new buffer or recycling an old one */ 238 - if (pool->skbuff[index]) 239 - goto reuse; 240 - 241 - skb = netdev_alloc_skb(adapter->netdev, pool->buff_size); 242 - 243 - if (!skb) { 244 - netdev_dbg(adapter->netdev, 245 - "replenish: unable to allocate skb\n"); 246 - adapter->replenish_no_mem++; 295 + if (!filled) 247 296 break; 248 - } 249 297 250 - dma_addr = dma_map_single(&adapter->vdev->dev, skb->data, 251 - pool->buff_size, DMA_FROM_DEVICE); 252 - 253 - if (dma_mapping_error(&adapter->vdev->dev, dma_addr)) 254 - goto failure; 255 - 256 - pool->dma_addr[index] = dma_addr; 257 - pool->skbuff[index] = skb; 258 - 259 - if (rx_flush) { 260 - unsigned int len = min(pool->buff_size, 261 - adapter->netdev->mtu + 262 - IBMVETH_BUFF_OH); 263 - ibmveth_flush_buffer(skb->data, len); 264 - } 265 - reuse: 266 - dma_addr = pool->dma_addr[index]; 267 - desc.fields.flags_len = IBMVETH_BUF_VALID | pool->buff_size; 268 - desc.fields.address = dma_addr; 269 - 270 - correlator = ((u64)pool->index << 32) | index; 271 - *(u64 *)pool->skbuff[index]->data = correlator; 272 - 273 - 
lpar_rc = h_add_logical_lan_buffer(adapter->vdev->unit_address, 274 - desc.desc); 275 - 298 + /* single buffer case*/ 299 + if (filled == 1) 300 + lpar_rc = h_add_logical_lan_buffer(vdev->unit_address, 301 + descs[0].desc); 302 + else 303 + /* Multi-buffer hcall */ 304 + lpar_rc = h_add_logical_lan_buffers(vdev->unit_address, 305 + descs[0].desc, 306 + descs[1].desc, 307 + descs[2].desc, 308 + descs[3].desc, 309 + descs[4].desc, 310 + descs[5].desc, 311 + descs[6].desc, 312 + descs[7].desc); 276 313 if (lpar_rc != H_SUCCESS) { 277 - netdev_warn(adapter->netdev, 278 - "%sadd_logical_lan failed %lu\n", 279 - skb ? "" : "When recycling: ", lpar_rc); 280 - goto failure; 314 + dev_warn_ratelimited(dev, 315 + "RX h_add_logical_lan failed: filled=%u, rc=%lu, batch=%u\n", 316 + filled, lpar_rc, batch); 317 + goto hcall_failure; 281 318 } 282 319 283 - pool->free_map[free_index] = IBM_VETH_INVALID_MAP; 284 - pool->consumer_index++; 285 - if (pool->consumer_index >= pool->size) 286 - pool->consumer_index = 0; 320 + /* Only update pool state after hcall succeeds */ 321 + for (i = 0; i < filled; i++) { 322 + free_index = pool->consumer_index; 323 + pool->free_map[free_index] = IBM_VETH_INVALID_MAP; 287 324 288 - buffers_added++; 289 - adapter->replenish_add_buff_success++; 325 + pool->consumer_index++; 326 + if (pool->consumer_index >= pool->size) 327 + pool->consumer_index = 0; 328 + } 329 + 330 + buffers_added += filled; 331 + adapter->replenish_add_buff_success += filled; 332 + remaining -= filled; 333 + 334 + memset(&descs, 0, sizeof(descs)); 335 + memset(&correlators, 0, sizeof(correlators)); 336 + continue; 337 + 338 + hcall_failure: 339 + for (i = 0; i < filled; i++) { 340 + index = correlators[i] & 0xffffffffUL; 341 + dma_addr = pool->dma_addr[index]; 342 + 343 + if (pool->skbuff[index]) { 344 + if (dma_addr && 345 + !dma_mapping_error(dev, dma_addr)) 346 + dma_unmap_single(dev, dma_addr, 347 + pool->buff_size, 348 + DMA_FROM_DEVICE); 349 + 350 + 
dev_kfree_skb_any(pool->skbuff[index]); 351 + pool->skbuff[index] = NULL; 352 + } 353 + } 354 + adapter->replenish_add_buff_failure += filled; 355 + 356 + /* 357 + * If multi rx buffers hcall is no longer supported by FW 358 + * e.g. in the case of Live Parttion Migration 359 + */ 360 + if (batch > 1 && lpar_rc == H_FUNCTION) { 361 + /* 362 + * Instead of retry submit single buffer individually 363 + * here just set the max rx buffer per hcall to 1 364 + * buffers will be respleshed next time 365 + * when ibmveth_replenish_buffer_pool() is called again 366 + * with single-buffer case 367 + */ 368 + netdev_info(adapter->netdev, 369 + "RX Multi buffers not supported by FW, rc=%lu\n", 370 + lpar_rc); 371 + adapter->rx_buffers_per_hcall = 1; 372 + netdev_info(adapter->netdev, 373 + "Next rx replesh will fall back to single-buffer hcall\n"); 374 + } 375 + break; 290 376 } 291 - 292 - mb(); 293 - atomic_add(buffers_added, &(pool->available)); 294 - return; 295 - 296 - failure: 297 - 298 - if (dma_addr && !dma_mapping_error(&adapter->vdev->dev, dma_addr)) 299 - dma_unmap_single(&adapter->vdev->dev, 300 - pool->dma_addr[index], pool->buff_size, 301 - DMA_FROM_DEVICE); 302 - dev_kfree_skb_any(pool->skbuff[index]); 303 - pool->skbuff[index] = NULL; 304 - bad_index_failure: 305 - adapter->replenish_add_buff_failure++; 306 377 307 378 mb(); 308 379 atomic_add(buffers_added, &(pool->available)); ··· 1852 1781 adapter->is_active_trunk = true; 1853 1782 netdev->hw_features |= NETIF_F_FRAGLIST; 1854 1783 netdev->features |= NETIF_F_FRAGLIST; 1784 + } 1785 + 1786 + if (ret == H_SUCCESS && 1787 + (ret_attr & IBMVETH_ILLAN_RX_MULTI_BUFF_SUPPORT)) { 1788 + adapter->rx_buffers_per_hcall = IBMVETH_MAX_RX_PER_HCALL; 1789 + netdev_dbg(netdev, 1790 + "RX Multi-buffer hcall supported by FW, batch set to %u\n", 1791 + adapter->rx_buffers_per_hcall); 1792 + } else { 1793 + adapter->rx_buffers_per_hcall = 1; 1794 + netdev_dbg(netdev, 1795 + "RX Single-buffer hcall mode, batch set to %u\n", 
1796 + adapter->rx_buffers_per_hcall); 1855 1797 } 1856 1798 1857 1799 netdev->min_mtu = IBMVETH_MIN_MTU;
+21
drivers/net/ethernet/ibm/ibmveth.h
··· 28 28 #define IbmVethMcastRemoveFilter 0x2UL 29 29 #define IbmVethMcastClearFilterTable 0x3UL 30 30 31 + #define IBMVETH_ILLAN_RX_MULTI_BUFF_SUPPORT 0x0000000000040000UL 31 32 #define IBMVETH_ILLAN_LRG_SR_ENABLED 0x0000000000010000UL 32 33 #define IBMVETH_ILLAN_LRG_SND_SUPPORT 0x0000000000008000UL 33 34 #define IBMVETH_ILLAN_PADDED_PKT_CSUM 0x0000000000002000UL ··· 46 45 47 46 #define h_add_logical_lan_buffer(ua, buf) \ 48 47 plpar_hcall_norets(H_ADD_LOGICAL_LAN_BUFFER, ua, buf) 48 + 49 + static inline long h_add_logical_lan_buffers(unsigned long unit_address, 50 + unsigned long desc1, 51 + unsigned long desc2, 52 + unsigned long desc3, 53 + unsigned long desc4, 54 + unsigned long desc5, 55 + unsigned long desc6, 56 + unsigned long desc7, 57 + unsigned long desc8) 58 + { 59 + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; 60 + 61 + return plpar_hcall9(H_ADD_LOGICAL_LAN_BUFFERS, 62 + retbuf, unit_address, 63 + desc1, desc2, desc3, desc4, 64 + desc5, desc6, desc7, desc8); 65 + } 49 66 50 67 /* FW allows us to send 6 descriptors but we only use one so mark 51 68 * the other 5 as unused (0) ··· 120 101 #define IBMVETH_MAX_TX_BUF_SIZE (1024 * 64) 121 102 #define IBMVETH_MAX_QUEUES 16U 122 103 #define IBMVETH_DEFAULT_QUEUES 8U 104 + #define IBMVETH_MAX_RX_PER_HCALL 8U 123 105 124 106 static int pool_size[] = { 512, 1024 * 2, 1024 * 16, 1024 * 32, 1024 * 64 }; 125 107 static int pool_count[] = { 256, 512, 256, 256, 256 }; ··· 171 151 int rx_csum; 172 152 int large_send; 173 153 bool is_active_trunk; 154 + unsigned int rx_buffers_per_hcall; 174 155 175 156 u64 fw_ipv6_csum_support; 176 157 u64 fw_ipv4_csum_support;