Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'virtio-net-support-af_xdp-zero-copy'

Xuan Zhuo says:

====================
virtio-net: support AF_XDP zero copy

v5: http://lore.kernel.org/all/20240611114147.31320-1-xuanzhuo@linux.alibaba.com

XDP socket (AF_XDP) is an excellent kernel-bypass network framework. The zero
copy feature of xsk (XDP socket) needs to be supported by the driver. The
performance of zero copy is very good. mlx5 and intel ixgbe already support
this feature. This patch set allows virtio-net to support xsk's zerocopy xmit
feature.

At present, we have completed some preparation:

1. vq-reset (virtio spec and kernel code)
2. virtio-core premapped dma
3. virtio-net xdp refactor

So it is time for Virtio-Net to complete the support for the XDP Socket
Zerocopy.

Virtio-net cannot increase the number of queues at will, so xsk shares the
queues with the kernel.

On the other hand, Virtio-Net does not support generating an interrupt manually
from the driver, so when we wake up tx xmit, we use some tricks. If the CPU on
which TX NAPI last ran is a different CPU, we use an IPI to wake up NAPI on that
remote CPU. If it is the local CPU, we wake up NAPI directly.

This patch set includes some refactoring of virtio-net to let it support
AF_XDP.

Because there are too many commits, the work of making virtio-net support
af-xdp is split into an rx part and a tx part. This patch set is for the rx part.

So the flag NETDEV_XDP_ACT_XSK_ZEROCOPY is not added yet; if someone wants to
test af-xdp rx, the flag needs to be added locally.

ENV: Qemu with vhost-user(polling mode).
Host CPU: Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz

testpmd> show port stats all

######################## NIC statistics for port 0 ########################
RX-packets: 19531092064 RX-missed: 0 RX-bytes: 1093741155584
RX-errors: 0
RX-nombuf: 0
TX-packets: 5959955552 TX-errors: 0 TX-bytes: 371030645664

Throughput (since last show)
Rx-pps: 8861574 Rx-bps: 3969985208
Tx-pps: 8861493 Tx-bps: 3969962736
############################################################################

testpmd> show port stats all

######################## NIC statistics for port 0 ########################
RX-packets: 68152727 RX-missed: 0 RX-bytes: 3816552712
RX-errors: 0
RX-nombuf: 0
TX-packets: 68114967 TX-errors: 33216 TX-bytes: 3814438152

Throughput (since last show)
Rx-pps: 6333196 Rx-bps: 2837272088
Tx-pps: 6333227 Tx-bps: 2837285936
############################################################################

But AF_XDP consumes more CPU for tx and rx napi(100% and 86%).
====================

Link: https://patch.msgid.link/20240708112537.96291-1-xuanzhuo@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+676 -94
+676 -94
drivers/net/virtio_net.c
··· 25 25 #include <net/net_failover.h> 26 26 #include <net/netdev_rx_queue.h> 27 27 #include <net/netdev_queues.h> 28 + #include <net/xdp_sock_drv.h> 28 29 29 30 static int napi_weight = NAPI_POLL_WEIGHT; 30 31 module_param(napi_weight, int, 0444); ··· 40 39 #define GOOD_COPY_LEN 128 41 40 42 41 #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) 43 - 44 - /* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */ 45 - #define VIRTIO_XDP_HEADROOM 256 46 42 47 43 /* Separating two types of XDP xmit */ 48 44 #define VIRTIO_XDP_TX BIT(0) ··· 349 351 350 352 /* Record the last dma info to free after new pages is allocated. */ 351 353 struct virtnet_rq_dma *last_dma; 354 + 355 + struct xsk_buff_pool *xsk_pool; 356 + 357 + /* xdp rxq used by xsk */ 358 + struct xdp_rxq_info xsk_rxq_info; 359 + 360 + struct xdp_buff **xsk_buffs; 352 361 }; 353 362 354 363 /* This structure can contain rss message with maximum settings for indirection table and keysize ··· 498 493 }; 499 494 500 495 static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf); 496 + static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp, 497 + struct net_device *dev, 498 + unsigned int *xdp_xmit, 499 + struct virtnet_rq_stats *stats); 500 + static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, 501 + struct sk_buff *skb, u8 flags); 502 + static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, 503 + struct sk_buff *curr_skb, 504 + struct page *page, void *buf, 505 + int len, int truesize); 501 506 502 507 static bool is_xdp_frame(void *ptr) 503 508 { ··· 988 973 989 974 rq = &vi->rq[i]; 990 975 976 + if (rq->xsk_pool) { 977 + xsk_buff_free((struct xdp_buff *)buf); 978 + return; 979 + } 980 + 991 981 if (!vi->big_packets || vi->mergeable_rx_bufs) 992 982 virtnet_rq_unmap(rq, buf, 0); 993 983 ··· 1069 1049 } 1070 1050 } 1071 1051 } 1052 + } 1053 + 1054 + static void sg_fill_dma(struct scatterlist *sg, 
dma_addr_t addr, u32 len) 1055 + { 1056 + sg->dma_address = addr; 1057 + sg->length = len; 1058 + } 1059 + 1060 + static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi, 1061 + struct receive_queue *rq, void *buf, u32 len) 1062 + { 1063 + struct xdp_buff *xdp; 1064 + u32 bufsize; 1065 + 1066 + xdp = (struct xdp_buff *)buf; 1067 + 1068 + bufsize = xsk_pool_get_rx_frame_size(rq->xsk_pool) + vi->hdr_len; 1069 + 1070 + if (unlikely(len > bufsize)) { 1071 + pr_debug("%s: rx error: len %u exceeds truesize %u\n", 1072 + vi->dev->name, len, bufsize); 1073 + DEV_STATS_INC(vi->dev, rx_length_errors); 1074 + xsk_buff_free(xdp); 1075 + return NULL; 1076 + } 1077 + 1078 + xsk_buff_set_size(xdp, len); 1079 + xsk_buff_dma_sync_for_cpu(xdp); 1080 + 1081 + return xdp; 1082 + } 1083 + 1084 + static struct sk_buff *xsk_construct_skb(struct receive_queue *rq, 1085 + struct xdp_buff *xdp) 1086 + { 1087 + unsigned int metasize = xdp->data - xdp->data_meta; 1088 + struct sk_buff *skb; 1089 + unsigned int size; 1090 + 1091 + size = xdp->data_end - xdp->data_hard_start; 1092 + skb = napi_alloc_skb(&rq->napi, size); 1093 + if (unlikely(!skb)) { 1094 + xsk_buff_free(xdp); 1095 + return NULL; 1096 + } 1097 + 1098 + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); 1099 + 1100 + size = xdp->data_end - xdp->data_meta; 1101 + memcpy(__skb_put(skb, size), xdp->data_meta, size); 1102 + 1103 + if (metasize) { 1104 + __skb_pull(skb, metasize); 1105 + skb_metadata_set(skb, metasize); 1106 + } 1107 + 1108 + xsk_buff_free(xdp); 1109 + 1110 + return skb; 1111 + } 1112 + 1113 + static struct sk_buff *virtnet_receive_xsk_small(struct net_device *dev, struct virtnet_info *vi, 1114 + struct receive_queue *rq, struct xdp_buff *xdp, 1115 + unsigned int *xdp_xmit, 1116 + struct virtnet_rq_stats *stats) 1117 + { 1118 + struct bpf_prog *prog; 1119 + u32 ret; 1120 + 1121 + ret = XDP_PASS; 1122 + rcu_read_lock(); 1123 + prog = rcu_dereference(rq->xdp_prog); 1124 + if (prog) 1125 + ret = 
virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); 1126 + rcu_read_unlock(); 1127 + 1128 + switch (ret) { 1129 + case XDP_PASS: 1130 + return xsk_construct_skb(rq, xdp); 1131 + 1132 + case XDP_TX: 1133 + case XDP_REDIRECT: 1134 + return NULL; 1135 + 1136 + default: 1137 + /* drop packet */ 1138 + xsk_buff_free(xdp); 1139 + u64_stats_inc(&stats->drops); 1140 + return NULL; 1141 + } 1142 + } 1143 + 1144 + static void xsk_drop_follow_bufs(struct net_device *dev, 1145 + struct receive_queue *rq, 1146 + u32 num_buf, 1147 + struct virtnet_rq_stats *stats) 1148 + { 1149 + struct xdp_buff *xdp; 1150 + u32 len; 1151 + 1152 + while (num_buf-- > 1) { 1153 + xdp = virtqueue_get_buf(rq->vq, &len); 1154 + if (unlikely(!xdp)) { 1155 + pr_debug("%s: rx error: %d buffers missing\n", 1156 + dev->name, num_buf); 1157 + DEV_STATS_INC(dev, rx_length_errors); 1158 + break; 1159 + } 1160 + u64_stats_add(&stats->bytes, len); 1161 + xsk_buff_free(xdp); 1162 + } 1163 + } 1164 + 1165 + static int xsk_append_merge_buffer(struct virtnet_info *vi, 1166 + struct receive_queue *rq, 1167 + struct sk_buff *head_skb, 1168 + u32 num_buf, 1169 + struct virtio_net_hdr_mrg_rxbuf *hdr, 1170 + struct virtnet_rq_stats *stats) 1171 + { 1172 + struct sk_buff *curr_skb; 1173 + struct xdp_buff *xdp; 1174 + u32 len, truesize; 1175 + struct page *page; 1176 + void *buf; 1177 + 1178 + curr_skb = head_skb; 1179 + 1180 + while (--num_buf) { 1181 + buf = virtqueue_get_buf(rq->vq, &len); 1182 + if (unlikely(!buf)) { 1183 + pr_debug("%s: rx error: %d buffers out of %d missing\n", 1184 + vi->dev->name, num_buf, 1185 + virtio16_to_cpu(vi->vdev, 1186 + hdr->num_buffers)); 1187 + DEV_STATS_INC(vi->dev, rx_length_errors); 1188 + return -EINVAL; 1189 + } 1190 + 1191 + u64_stats_add(&stats->bytes, len); 1192 + 1193 + xdp = buf_to_xdp(vi, rq, buf, len); 1194 + if (!xdp) 1195 + goto err; 1196 + 1197 + buf = napi_alloc_frag(len); 1198 + if (!buf) { 1199 + xsk_buff_free(xdp); 1200 + goto err; 1201 + } 1202 + 1203 + 
memcpy(buf, xdp->data - vi->hdr_len, len); 1204 + 1205 + xsk_buff_free(xdp); 1206 + 1207 + page = virt_to_page(buf); 1208 + 1209 + truesize = len; 1210 + 1211 + curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, 1212 + buf, len, truesize); 1213 + if (!curr_skb) { 1214 + put_page(page); 1215 + goto err; 1216 + } 1217 + } 1218 + 1219 + return 0; 1220 + 1221 + err: 1222 + xsk_drop_follow_bufs(vi->dev, rq, num_buf, stats); 1223 + return -EINVAL; 1224 + } 1225 + 1226 + static struct sk_buff *virtnet_receive_xsk_merge(struct net_device *dev, struct virtnet_info *vi, 1227 + struct receive_queue *rq, struct xdp_buff *xdp, 1228 + unsigned int *xdp_xmit, 1229 + struct virtnet_rq_stats *stats) 1230 + { 1231 + struct virtio_net_hdr_mrg_rxbuf *hdr; 1232 + struct bpf_prog *prog; 1233 + struct sk_buff *skb; 1234 + u32 ret, num_buf; 1235 + 1236 + hdr = xdp->data - vi->hdr_len; 1237 + num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers); 1238 + 1239 + ret = XDP_PASS; 1240 + rcu_read_lock(); 1241 + prog = rcu_dereference(rq->xdp_prog); 1242 + /* TODO: support multi buffer. 
*/ 1243 + if (prog && num_buf == 1) 1244 + ret = virtnet_xdp_handler(prog, xdp, dev, xdp_xmit, stats); 1245 + rcu_read_unlock(); 1246 + 1247 + switch (ret) { 1248 + case XDP_PASS: 1249 + skb = xsk_construct_skb(rq, xdp); 1250 + if (!skb) 1251 + goto drop_bufs; 1252 + 1253 + if (xsk_append_merge_buffer(vi, rq, skb, num_buf, hdr, stats)) { 1254 + dev_kfree_skb(skb); 1255 + goto drop; 1256 + } 1257 + 1258 + return skb; 1259 + 1260 + case XDP_TX: 1261 + case XDP_REDIRECT: 1262 + return NULL; 1263 + 1264 + default: 1265 + /* drop packet */ 1266 + xsk_buff_free(xdp); 1267 + } 1268 + 1269 + drop_bufs: 1270 + xsk_drop_follow_bufs(dev, rq, num_buf, stats); 1271 + 1272 + drop: 1273 + u64_stats_inc(&stats->drops); 1274 + return NULL; 1275 + } 1276 + 1277 + static void virtnet_receive_xsk_buf(struct virtnet_info *vi, struct receive_queue *rq, 1278 + void *buf, u32 len, 1279 + unsigned int *xdp_xmit, 1280 + struct virtnet_rq_stats *stats) 1281 + { 1282 + struct net_device *dev = vi->dev; 1283 + struct sk_buff *skb = NULL; 1284 + struct xdp_buff *xdp; 1285 + u8 flags; 1286 + 1287 + len -= vi->hdr_len; 1288 + 1289 + u64_stats_add(&stats->bytes, len); 1290 + 1291 + xdp = buf_to_xdp(vi, rq, buf, len); 1292 + if (!xdp) 1293 + return; 1294 + 1295 + if (unlikely(len < ETH_HLEN)) { 1296 + pr_debug("%s: short packet %i\n", dev->name, len); 1297 + DEV_STATS_INC(dev, rx_length_errors); 1298 + xsk_buff_free(xdp); 1299 + return; 1300 + } 1301 + 1302 + flags = ((struct virtio_net_common_hdr *)(xdp->data - vi->hdr_len))->hdr.flags; 1303 + 1304 + if (!vi->mergeable_rx_bufs) 1305 + skb = virtnet_receive_xsk_small(dev, vi, rq, xdp, xdp_xmit, stats); 1306 + else 1307 + skb = virtnet_receive_xsk_merge(dev, vi, rq, xdp, xdp_xmit, stats); 1308 + 1309 + if (skb) 1310 + virtnet_receive_done(vi, rq, skb, flags); 1311 + } 1312 + 1313 + static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue *rq, 1314 + struct xsk_buff_pool *pool, gfp_t gfp) 1315 + { 1316 + struct xdp_buff 
**xsk_buffs; 1317 + dma_addr_t addr; 1318 + int err = 0; 1319 + u32 len, i; 1320 + int num; 1321 + 1322 + xsk_buffs = rq->xsk_buffs; 1323 + 1324 + num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free); 1325 + if (!num) 1326 + return -ENOMEM; 1327 + 1328 + len = xsk_pool_get_rx_frame_size(pool) + vi->hdr_len; 1329 + 1330 + for (i = 0; i < num; ++i) { 1331 + /* Use the part of XDP_PACKET_HEADROOM as the virtnet hdr space. 1332 + * We assume XDP_PACKET_HEADROOM is larger than hdr->len. 1333 + * (see function virtnet_xsk_pool_enable) 1334 + */ 1335 + addr = xsk_buff_xdp_get_dma(xsk_buffs[i]) - vi->hdr_len; 1336 + 1337 + sg_init_table(rq->sg, 1); 1338 + sg_fill_dma(rq->sg, addr, len); 1339 + 1340 + err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, xsk_buffs[i], gfp); 1341 + if (err) 1342 + goto err; 1343 + } 1344 + 1345 + return num; 1346 + 1347 + err: 1348 + for (; i < num; ++i) 1349 + xsk_buff_free(xsk_buffs[i]); 1350 + 1351 + return err; 1352 + } 1353 + 1354 + static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag) 1355 + { 1356 + struct virtnet_info *vi = netdev_priv(dev); 1357 + struct send_queue *sq; 1358 + 1359 + if (!netif_running(dev)) 1360 + return -ENETDOWN; 1361 + 1362 + if (qid >= vi->curr_queue_pairs) 1363 + return -EINVAL; 1364 + 1365 + sq = &vi->sq[qid]; 1366 + 1367 + if (napi_if_scheduled_mark_missed(&sq->napi)) 1368 + return 0; 1369 + 1370 + local_bh_disable(); 1371 + virtqueue_napi_schedule(&sq->napi, sq->vq); 1372 + local_bh_enable(); 1373 + 1374 + return 0; 1072 1375 } 1073 1376 1074 1377 static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, ··· 1611 1268 1612 1269 static unsigned int virtnet_get_headroom(struct virtnet_info *vi) 1613 1270 { 1614 - return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0; 1271 + return vi->xdp_enabled ? 
XDP_PACKET_HEADROOM : 0; 1615 1272 } 1616 1273 1617 1274 /* We copy the packet for XDP in the following cases: ··· 1675 1332 } 1676 1333 1677 1334 /* Headroom does not contribute to packet length */ 1678 - *len = page_off - VIRTIO_XDP_HEADROOM; 1335 + *len = page_off - XDP_PACKET_HEADROOM; 1679 1336 return page; 1680 1337 err_buf: 1681 1338 __free_pages(page, 0); ··· 1962 1619 void *ctx; 1963 1620 1964 1621 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq); 1965 - xdp_prepare_buff(xdp, buf - VIRTIO_XDP_HEADROOM, 1966 - VIRTIO_XDP_HEADROOM + vi->hdr_len, len - vi->hdr_len, true); 1622 + xdp_prepare_buff(xdp, buf - XDP_PACKET_HEADROOM, 1623 + XDP_PACKET_HEADROOM + vi->hdr_len, len - vi->hdr_len, true); 1967 1624 1968 1625 if (!*num_buf) 1969 1626 return 0; ··· 2080 1737 /* linearize data for XDP */ 2081 1738 xdp_page = xdp_linearize_page(rq, num_buf, 2082 1739 *page, offset, 2083 - VIRTIO_XDP_HEADROOM, 1740 + XDP_PACKET_HEADROOM, 2084 1741 len); 2085 1742 if (!xdp_page) 2086 1743 return NULL; 2087 1744 } else { 2088 - xdp_room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM + 1745 + xdp_room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + 2089 1746 sizeof(struct skb_shared_info)); 2090 1747 if (*len + xdp_room > PAGE_SIZE) 2091 1748 return NULL; ··· 2094 1751 if (!xdp_page) 2095 1752 return NULL; 2096 1753 2097 - memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM, 1754 + memcpy(page_address(xdp_page) + XDP_PACKET_HEADROOM, 2098 1755 page_address(*page) + offset, *len); 2099 1756 } 2100 1757 ··· 2104 1761 2105 1762 *page = xdp_page; 2106 1763 2107 - return page_address(*page) + VIRTIO_XDP_HEADROOM; 1764 + return page_address(*page) + XDP_PACKET_HEADROOM; 2108 1765 } 2109 1766 2110 1767 static struct sk_buff *receive_mergeable_xdp(struct net_device *dev, ··· 2167 1824 return NULL; 2168 1825 } 2169 1826 1827 + static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, 1828 + struct sk_buff *curr_skb, 1829 + struct page *page, void *buf, 1830 + int len, int truesize) 1831 + { 1832 + 
int num_skb_frags; 1833 + int offset; 1834 + 1835 + num_skb_frags = skb_shinfo(curr_skb)->nr_frags; 1836 + if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { 1837 + struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); 1838 + 1839 + if (unlikely(!nskb)) 1840 + return NULL; 1841 + 1842 + if (curr_skb == head_skb) 1843 + skb_shinfo(curr_skb)->frag_list = nskb; 1844 + else 1845 + curr_skb->next = nskb; 1846 + curr_skb = nskb; 1847 + head_skb->truesize += nskb->truesize; 1848 + num_skb_frags = 0; 1849 + } 1850 + 1851 + if (curr_skb != head_skb) { 1852 + head_skb->data_len += len; 1853 + head_skb->len += len; 1854 + head_skb->truesize += truesize; 1855 + } 1856 + 1857 + offset = buf - page_address(page); 1858 + if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { 1859 + put_page(page); 1860 + skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, 1861 + len, truesize); 1862 + } else { 1863 + skb_add_rx_frag(curr_skb, num_skb_frags, page, 1864 + offset, len, truesize); 1865 + } 1866 + 1867 + return curr_skb; 1868 + } 1869 + 2170 1870 static struct sk_buff *receive_mergeable(struct net_device *dev, 2171 1871 struct virtnet_info *vi, 2172 1872 struct receive_queue *rq, ··· 2259 1873 if (unlikely(!curr_skb)) 2260 1874 goto err_skb; 2261 1875 while (--num_buf) { 2262 - int num_skb_frags; 2263 - 2264 1876 buf = virtnet_rq_get_buf(rq, &len, &ctx); 2265 1877 if (unlikely(!buf)) { 2266 1878 pr_debug("%s: rx error: %d buffers out of %d missing\n", ··· 2283 1899 goto err_skb; 2284 1900 } 2285 1901 2286 - num_skb_frags = skb_shinfo(curr_skb)->nr_frags; 2287 - if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) { 2288 - struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC); 2289 - 2290 - if (unlikely(!nskb)) 2291 - goto err_skb; 2292 - if (curr_skb == head_skb) 2293 - skb_shinfo(curr_skb)->frag_list = nskb; 2294 - else 2295 - curr_skb->next = nskb; 2296 - curr_skb = nskb; 2297 - head_skb->truesize += nskb->truesize; 2298 - num_skb_frags = 0; 2299 - } 2300 - if (curr_skb != head_skb) { 2301 - 
head_skb->data_len += len; 2302 - head_skb->len += len; 2303 - head_skb->truesize += truesize; 2304 - } 2305 - offset = buf - page_address(page); 2306 - if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) { 2307 - put_page(page); 2308 - skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1, 2309 - len, truesize); 2310 - } else { 2311 - skb_add_rx_frag(curr_skb, num_skb_frags, page, 2312 - offset, len, truesize); 2313 - } 1902 + curr_skb = virtnet_skb_append_frag(head_skb, curr_skb, page, 1903 + buf, len, truesize); 1904 + if (!curr_skb) 1905 + goto err_skb; 2314 1906 } 2315 1907 2316 1908 ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len); ··· 2331 1971 skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); 2332 1972 } 2333 1973 1974 + static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, 1975 + struct sk_buff *skb, u8 flags) 1976 + { 1977 + struct virtio_net_common_hdr *hdr; 1978 + struct net_device *dev = vi->dev; 1979 + 1980 + hdr = skb_vnet_common_hdr(skb); 1981 + if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) 1982 + virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); 1983 + 1984 + if (flags & VIRTIO_NET_HDR_F_DATA_VALID) 1985 + skb->ip_summed = CHECKSUM_UNNECESSARY; 1986 + 1987 + if (virtio_net_hdr_to_skb(skb, &hdr->hdr, 1988 + virtio_is_little_endian(vi->vdev))) { 1989 + net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n", 1990 + dev->name, hdr->hdr.gso_type, 1991 + hdr->hdr.gso_size); 1992 + goto frame_err; 1993 + } 1994 + 1995 + skb_record_rx_queue(skb, vq2rxq(rq->vq)); 1996 + skb->protocol = eth_type_trans(skb, dev); 1997 + pr_debug("Receiving skb proto 0x%04x len %i type %i\n", 1998 + ntohs(skb->protocol), skb->len, skb->pkt_type); 1999 + 2000 + napi_gro_receive(&rq->napi, skb); 2001 + return; 2002 + 2003 + frame_err: 2004 + DEV_STATS_INC(dev, rx_frame_errors); 2005 + dev_kfree_skb(skb); 2006 + } 2007 + 2334 2008 static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, 
2335 2009 void *buf, unsigned int len, void **ctx, 2336 2010 unsigned int *xdp_xmit, ··· 2372 1978 { 2373 1979 struct net_device *dev = vi->dev; 2374 1980 struct sk_buff *skb; 2375 - struct virtio_net_common_hdr *hdr; 2376 1981 u8 flags; 2377 1982 2378 1983 if (unlikely(len < vi->hdr_len + ETH_HLEN)) { ··· 2401 2008 if (unlikely(!skb)) 2402 2009 return; 2403 2010 2404 - hdr = skb_vnet_common_hdr(skb); 2405 - if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report) 2406 - virtio_skb_set_hash(&hdr->hash_v1_hdr, skb); 2407 - 2408 - if (flags & VIRTIO_NET_HDR_F_DATA_VALID) 2409 - skb->ip_summed = CHECKSUM_UNNECESSARY; 2410 - 2411 - if (virtio_net_hdr_to_skb(skb, &hdr->hdr, 2412 - virtio_is_little_endian(vi->vdev))) { 2413 - net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n", 2414 - dev->name, hdr->hdr.gso_type, 2415 - hdr->hdr.gso_size); 2416 - goto frame_err; 2417 - } 2418 - 2419 - skb_record_rx_queue(skb, vq2rxq(rq->vq)); 2420 - skb->protocol = eth_type_trans(skb, dev); 2421 - pr_debug("Receiving skb proto 0x%04x len %i type %i\n", 2422 - ntohs(skb->protocol), skb->len, skb->pkt_type); 2423 - 2424 - napi_gro_receive(&rq->napi, skb); 2425 - return; 2426 - 2427 - frame_err: 2428 - DEV_STATS_INC(dev, rx_frame_errors); 2429 - dev_kfree_skb(skb); 2011 + virtnet_receive_done(vi, rq, skb, flags); 2430 2012 } 2431 2013 2432 2014 /* Unlike mergeable buffers, all buffers are allocated to the ··· 2562 2194 gfp_t gfp) 2563 2195 { 2564 2196 int err; 2565 - bool oom; 2197 + 2198 + if (rq->xsk_pool) { 2199 + err = virtnet_add_recvbuf_xsk(vi, rq, rq->xsk_pool, gfp); 2200 + goto kick; 2201 + } 2566 2202 2567 2203 do { 2568 2204 if (vi->mergeable_rx_bufs) ··· 2576 2204 else 2577 2205 err = add_recvbuf_small(vi, rq, gfp); 2578 2206 2579 - oom = err == -ENOMEM; 2580 2207 if (err) 2581 2208 break; 2582 2209 } while (rq->vq->num_free); 2210 + 2211 + kick: 2583 2212 if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) { 2584 2213 unsigned long flags; 2585 2214 ··· 
2589 2216 u64_stats_update_end_irqrestore(&rq->stats.syncp, flags); 2590 2217 } 2591 2218 2592 - return !oom; 2219 + return err != -ENOMEM; 2593 2220 } 2594 2221 2595 2222 static void skb_recv_done(struct virtqueue *rvq) ··· 2660 2287 } 2661 2288 } 2662 2289 2663 - static int virtnet_receive(struct receive_queue *rq, int budget, 2664 - unsigned int *xdp_xmit) 2290 + static int virtnet_receive_xsk_bufs(struct virtnet_info *vi, 2291 + struct receive_queue *rq, 2292 + int budget, 2293 + unsigned int *xdp_xmit, 2294 + struct virtnet_rq_stats *stats) 2665 2295 { 2666 - struct virtnet_info *vi = rq->vq->vdev->priv; 2667 - struct virtnet_rq_stats stats = {}; 2668 2296 unsigned int len; 2669 2297 int packets = 0; 2670 2298 void *buf; 2671 - int i; 2299 + 2300 + while (packets < budget) { 2301 + buf = virtqueue_get_buf(rq->vq, &len); 2302 + if (!buf) 2303 + break; 2304 + 2305 + virtnet_receive_xsk_buf(vi, rq, buf, len, xdp_xmit, stats); 2306 + packets++; 2307 + } 2308 + 2309 + return packets; 2310 + } 2311 + 2312 + static int virtnet_receive_packets(struct virtnet_info *vi, 2313 + struct receive_queue *rq, 2314 + int budget, 2315 + unsigned int *xdp_xmit, 2316 + struct virtnet_rq_stats *stats) 2317 + { 2318 + unsigned int len; 2319 + int packets = 0; 2320 + void *buf; 2672 2321 2673 2322 if (!vi->big_packets || vi->mergeable_rx_bufs) { 2674 2323 void *ctx; 2675 - 2676 2324 while (packets < budget && 2677 2325 (buf = virtnet_rq_get_buf(rq, &len, &ctx))) { 2678 - receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats); 2326 + receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats); 2679 2327 packets++; 2680 2328 } 2681 2329 } else { 2682 2330 while (packets < budget && 2683 2331 (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { 2684 - receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats); 2332 + receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats); 2685 2333 packets++; 2686 2334 } 2687 2335 } 2336 + 2337 + return packets; 2338 + } 2339 + 2340 + static int virtnet_receive(struct 
receive_queue *rq, int budget, 2341 + unsigned int *xdp_xmit) 2342 + { 2343 + struct virtnet_info *vi = rq->vq->vdev->priv; 2344 + struct virtnet_rq_stats stats = {}; 2345 + int i, packets; 2346 + 2347 + if (rq->xsk_pool) 2348 + packets = virtnet_receive_xsk_bufs(vi, rq, budget, xdp_xmit, &stats); 2349 + else 2350 + packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats); 2688 2351 2689 2352 if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) { 2690 2353 if (!try_fill_recv(vi, rq, GFP_ATOMIC)) { ··· 3077 2668 return NETDEV_TX_OK; 3078 2669 } 3079 2670 3080 - static int virtnet_rx_resize(struct virtnet_info *vi, 3081 - struct receive_queue *rq, u32 ring_num) 2671 + static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) 3082 2672 { 3083 2673 bool running = netif_running(vi->dev); 3084 - int err, qindex; 3085 - 3086 - qindex = rq - vi->rq; 3087 2674 3088 2675 if (running) { 3089 2676 napi_disable(&rq->napi); 3090 2677 virtnet_cancel_dim(vi, &rq->dim); 3091 2678 } 2679 + } 3092 2680 3093 - err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); 3094 - if (err) 3095 - netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err); 2681 + static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) 2682 + { 2683 + bool running = netif_running(vi->dev); 3096 2684 3097 2685 if (!try_fill_recv(vi, rq, GFP_KERNEL)) 3098 2686 schedule_delayed_work(&vi->refill, 0); 3099 2687 3100 2688 if (running) 3101 2689 virtnet_napi_enable(rq->vq, &rq->napi); 2690 + } 2691 + 2692 + static int virtnet_rx_resize(struct virtnet_info *vi, 2693 + struct receive_queue *rq, u32 ring_num) 2694 + { 2695 + int err, qindex; 2696 + 2697 + qindex = rq - vi->rq; 2698 + 2699 + virtnet_rx_pause(vi, rq); 2700 + 2701 + err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf); 2702 + if (err) 2703 + netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, 
err); 2704 + 2705 + virtnet_rx_resume(vi, rq); 3102 2706 return err; 3103 2707 } 3104 2708 3105 - static int virtnet_tx_resize(struct virtnet_info *vi, 3106 - struct send_queue *sq, u32 ring_num) 2709 + static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq) 3107 2710 { 3108 2711 bool running = netif_running(vi->dev); 3109 2712 struct netdev_queue *txq; 3110 - int err, qindex; 2713 + int qindex; 3111 2714 3112 2715 qindex = sq - vi->sq; 3113 2716 ··· 3140 2719 netif_stop_subqueue(vi->dev, qindex); 3141 2720 3142 2721 __netif_tx_unlock_bh(txq); 2722 + } 3143 2723 3144 - err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf); 3145 - if (err) 3146 - netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); 2724 + static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq) 2725 + { 2726 + bool running = netif_running(vi->dev); 2727 + struct netdev_queue *txq; 2728 + int qindex; 2729 + 2730 + qindex = sq - vi->sq; 2731 + 2732 + txq = netdev_get_tx_queue(vi->dev, qindex); 3147 2733 3148 2734 __netif_tx_lock_bh(txq); 3149 2735 sq->reset = false; ··· 3159 2731 3160 2732 if (running) 3161 2733 virtnet_napi_tx_enable(vi, sq->vq, &sq->napi); 2734 + } 2735 + 2736 + static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq, 2737 + u32 ring_num) 2738 + { 2739 + int qindex, err; 2740 + 2741 + qindex = sq - vi->sq; 2742 + 2743 + virtnet_tx_pause(vi, sq); 2744 + 2745 + err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf); 2746 + if (err) 2747 + netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err); 2748 + 2749 + virtnet_tx_resume(vi, sq); 2750 + 3162 2751 return err; 3163 2752 } 3164 2753 ··· 5413 4968 return virtnet_set_guest_offloads(vi, offloads); 5414 4969 } 5415 4970 4971 + static int virtnet_rq_bind_xsk_pool(struct virtnet_info *vi, struct receive_queue *rq, 4972 + struct xsk_buff_pool *pool) 4973 + { 4974 + int err, qindex; 4975 + 4976 + 
qindex = rq - vi->rq; 4977 + 4978 + if (pool) { 4979 + err = xdp_rxq_info_reg(&rq->xsk_rxq_info, vi->dev, qindex, rq->napi.napi_id); 4980 + if (err < 0) 4981 + return err; 4982 + 4983 + err = xdp_rxq_info_reg_mem_model(&rq->xsk_rxq_info, 4984 + MEM_TYPE_XSK_BUFF_POOL, NULL); 4985 + if (err < 0) 4986 + goto unreg; 4987 + 4988 + xsk_pool_set_rxq_info(pool, &rq->xsk_rxq_info); 4989 + } 4990 + 4991 + virtnet_rx_pause(vi, rq); 4992 + 4993 + err = virtqueue_reset(rq->vq, virtnet_rq_unmap_free_buf); 4994 + if (err) { 4995 + netdev_err(vi->dev, "reset rx fail: rx queue index: %d err: %d\n", qindex, err); 4996 + 4997 + pool = NULL; 4998 + } 4999 + 5000 + rq->xsk_pool = pool; 5001 + 5002 + virtnet_rx_resume(vi, rq); 5003 + 5004 + if (pool) 5005 + return 0; 5006 + 5007 + unreg: 5008 + xdp_rxq_info_unreg(&rq->xsk_rxq_info); 5009 + return err; 5010 + } 5011 + 5012 + static int virtnet_xsk_pool_enable(struct net_device *dev, 5013 + struct xsk_buff_pool *pool, 5014 + u16 qid) 5015 + { 5016 + struct virtnet_info *vi = netdev_priv(dev); 5017 + struct receive_queue *rq; 5018 + struct device *dma_dev; 5019 + struct send_queue *sq; 5020 + int err, size; 5021 + 5022 + if (vi->hdr_len > xsk_pool_get_headroom(pool)) 5023 + return -EINVAL; 5024 + 5025 + /* In big_packets mode, xdp cannot work, so there is no need to 5026 + * initialize xsk of rq. 5027 + */ 5028 + if (vi->big_packets && !vi->mergeable_rx_bufs) 5029 + return -ENOENT; 5030 + 5031 + if (qid >= vi->curr_queue_pairs) 5032 + return -EINVAL; 5033 + 5034 + sq = &vi->sq[qid]; 5035 + rq = &vi->rq[qid]; 5036 + 5037 + /* xsk assumes that tx and rx must have the same dma device. The af-xdp 5038 + * may use one buffer to receive from the rx and reuse this buffer to 5039 + * send by the tx. So the dma dev of sq and rq must be the same one. 5040 + * 5041 + * But vq->dma_dev allows every vq has the respective dma dev. So I 5042 + * check the dma dev of vq and sq is the same dev. 
5043 + */ 5044 + if (virtqueue_dma_dev(rq->vq) != virtqueue_dma_dev(sq->vq)) 5045 + return -EINVAL; 5046 + 5047 + dma_dev = virtqueue_dma_dev(rq->vq); 5048 + if (!dma_dev) 5049 + return -EINVAL; 5050 + 5051 + size = virtqueue_get_vring_size(rq->vq); 5052 + 5053 + rq->xsk_buffs = kvcalloc(size, sizeof(*rq->xsk_buffs), GFP_KERNEL); 5054 + if (!rq->xsk_buffs) 5055 + return -ENOMEM; 5056 + 5057 + err = xsk_pool_dma_map(pool, dma_dev, 0); 5058 + if (err) 5059 + goto err_xsk_map; 5060 + 5061 + err = virtnet_rq_bind_xsk_pool(vi, rq, pool); 5062 + if (err) 5063 + goto err_rq; 5064 + 5065 + return 0; 5066 + 5067 + err_rq: 5068 + xsk_pool_dma_unmap(pool, 0); 5069 + err_xsk_map: 5070 + return err; 5071 + } 5072 + 5073 + static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) 5074 + { 5075 + struct virtnet_info *vi = netdev_priv(dev); 5076 + struct xsk_buff_pool *pool; 5077 + struct receive_queue *rq; 5078 + int err; 5079 + 5080 + if (qid >= vi->curr_queue_pairs) 5081 + return -EINVAL; 5082 + 5083 + rq = &vi->rq[qid]; 5084 + 5085 + pool = rq->xsk_pool; 5086 + 5087 + err = virtnet_rq_bind_xsk_pool(vi, rq, NULL); 5088 + 5089 + xsk_pool_dma_unmap(pool, 0); 5090 + 5091 + kvfree(rq->xsk_buffs); 5092 + 5093 + return err; 5094 + } 5095 + 5096 + static int virtnet_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp) 5097 + { 5098 + if (xdp->xsk.pool) 5099 + return virtnet_xsk_pool_enable(dev, xdp->xsk.pool, 5100 + xdp->xsk.queue_id); 5101 + else 5102 + return virtnet_xsk_pool_disable(dev, xdp->xsk.queue_id); 5103 + } 5104 + 5416 5105 static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, 5417 5106 struct netlink_ext_ack *extack) 5418 5107 { 5419 - unsigned int room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM + 5108 + unsigned int room = SKB_DATA_ALIGN(XDP_PACKET_HEADROOM + 5420 5109 sizeof(struct skb_shared_info)); 5421 5110 unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN; 5422 5111 struct virtnet_info *vi = netdev_priv(dev); ··· 5672 5093 switch 
(xdp->command) { 5673 5094 case XDP_SETUP_PROG: 5674 5095 return virtnet_xdp_set(dev, xdp->prog, xdp->extack); 5096 + case XDP_SETUP_XSK_POOL: 5097 + return virtnet_xsk_pool_setup(dev, xdp); 5675 5098 default: 5676 5099 return -EINVAL; 5677 5100 } ··· 5788 5207 .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, 5789 5208 .ndo_bpf = virtnet_xdp, 5790 5209 .ndo_xdp_xmit = virtnet_xdp_xmit, 5210 + .ndo_xsk_wakeup = virtnet_xsk_wakeup, 5791 5211 .ndo_features_check = passthru_features_check, 5792 5212 .ndo_get_phys_port_name = virtnet_get_phys_port_name, 5793 5213 .ndo_set_features = virtnet_set_features,