Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'net-af_packet-optimize-retire-operation'

Xin Zhao says:

====================
net: af_packet: optimize retire operation

In a system with strict real-time requirements, the timeout mechanism of
ordinary timers, which have jiffies granularity, is insufficient to meet the
demands for real-time performance. Meanwhile, the optimization of CPU
usage with af_packet is quite significant. Use an hrtimer instead of a
jiffies-based timer to help compensate for the shortcomings in real-time
performance.
On a system with HZ=100 or HZ=250, the update of TP_STATUS_USER is not timely
enough, with fluctuations reaching over 8 ms (on a system with HZ=250).
This is unacceptable in some systems with strict real-time requirements that
need timely processing of network packets. With the timer replaced by an
hrtimer and a timeout of 2 ms set, the update of TP_STATUS_USER can be
stabilized to within 3 ms.
====================

Link: https://patch.msgid.link/20250908104549.204412-1-jackzxcui1989@163.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+44 -104
+39 -93
net/packet/af_packet.c
··· 203 203 static int prb_queue_frozen(struct tpacket_kbdq_core *); 204 204 static void prb_open_block(struct tpacket_kbdq_core *, 205 205 struct tpacket_block_desc *); 206 - static void prb_retire_rx_blk_timer_expired(struct timer_list *); 207 - static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); 206 + static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); 208 207 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); 209 208 static void prb_clear_rxhash(struct tpacket_kbdq_core *, 210 209 struct tpacket3_hdr *); ··· 578 579 return proto; 579 580 } 580 581 581 - static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) 582 - { 583 - timer_delete_sync(&pkc->retire_blk_timer); 584 - } 585 - 586 582 static void prb_shutdown_retire_blk_timer(struct packet_sock *po, 587 583 struct sk_buff_head *rb_queue) 588 584 { 589 585 struct tpacket_kbdq_core *pkc; 590 586 591 587 pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 592 - 593 - spin_lock_bh(&rb_queue->lock); 594 - pkc->delete_blk_timer = 1; 595 - spin_unlock_bh(&rb_queue->lock); 596 - 597 - prb_del_retire_blk_timer(pkc); 598 - } 599 - 600 - static void prb_setup_retire_blk_timer(struct packet_sock *po) 601 - { 602 - struct tpacket_kbdq_core *pkc; 603 - 604 - pkc = GET_PBDQC_FROM_RB(&po->rx_ring); 605 - timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 606 - 0); 607 - pkc->retire_blk_timer.expires = jiffies; 588 + hrtimer_cancel(&pkc->retire_blk_timer); 608 589 } 609 590 610 591 static int prb_calc_retire_blk_tmo(struct packet_sock *po, ··· 648 669 p1->knum_blocks = req_u->req3.tp_block_nr; 649 670 p1->hdrlen = po->tp_hdrlen; 650 671 p1->version = po->tp_version; 651 - p1->last_kactive_blk_num = 0; 652 672 po->stats.stats3.tp_freeze_q_cnt = 0; 653 673 if (req_u->req3.tp_retire_blk_tov) 654 - p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; 674 + p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); 655 675 else 656 - 
p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, 657 - req_u->req3.tp_block_size); 658 - p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); 676 + p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, 677 + req_u->req3.tp_block_size)); 659 678 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; 660 679 rwlock_init(&p1->blk_fill_in_prog_lock); 661 680 662 681 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); 663 682 prb_init_ft_ops(p1, req_u); 664 - prb_setup_retire_blk_timer(po); 683 + hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, 684 + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); 685 + hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, 686 + HRTIMER_MODE_REL_SOFT); 665 687 prb_open_block(p1, pbd); 666 688 } 667 689 668 - /* Do NOT update the last_blk_num first. 669 - * Assumes sk_buff_head lock is held. 670 - */ 671 - static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) 672 - { 673 - mod_timer(&pkc->retire_blk_timer, 674 - jiffies + pkc->tov_in_jiffies); 675 - pkc->last_kactive_blk_num = pkc->kactive_blk_num; 676 - } 677 - 678 690 /* 679 - * Timer logic: 680 - * 1) We refresh the timer only when we open a block. 681 - * By doing this we don't waste cycles refreshing the timer 682 - * on packet-by-packet basis. 683 - * 684 691 * With a 1MB block-size, on a 1Gbps line, it will take 685 692 * i) ~8 ms to fill a block + ii) memcpy etc. 686 693 * In this cut we are not accounting for the memcpy time. 687 694 * 688 - * So, if the user sets the 'tmo' to 10ms then the timer 689 - * will never fire while the block is still getting filled 690 - * (which is what we want). However, the user could choose 691 - * to close a block early and that's fine. 692 - * 693 - * But when the timer does fire, we check whether or not to refresh it. 694 695 * Since the tmo granularity is in msecs, it is not too expensive 695 696 * to refresh the timer, lets say every '8' msecs. 
696 697 * Either the user can set the 'tmo' or we can derive it based on 697 698 * a) line-speed and b) block-size. 698 699 * prb_calc_retire_blk_tmo() calculates the tmo. 699 - * 700 700 */ 701 - static void prb_retire_rx_blk_timer_expired(struct timer_list *t) 701 + static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) 702 702 { 703 703 struct packet_sock *po = 704 704 timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); ··· 689 731 690 732 frozen = prb_queue_frozen(pkc); 691 733 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); 692 - 693 - if (unlikely(pkc->delete_blk_timer)) 694 - goto out; 695 734 696 735 /* We only need to plug the race when the block is partially filled. 697 736 * tpacket_rcv: ··· 705 750 write_unlock(&pkc->blk_fill_in_prog_lock); 706 751 } 707 752 708 - if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { 709 - if (!frozen) { 710 - if (!BLOCK_NUM_PKTS(pbd)) { 711 - /* An empty block. Just refresh the timer. */ 712 - goto refresh_timer; 713 - } 753 + if (!frozen) { 754 + if (BLOCK_NUM_PKTS(pbd)) { 755 + /* Not an empty block. Need retire the block. */ 714 756 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); 715 - if (!prb_dispatch_next_block(pkc, po)) 716 - goto refresh_timer; 717 - else 718 - goto out; 719 - } else { 720 - /* Case 1. Queue was frozen because user-space was 721 - * lagging behind. 757 + prb_dispatch_next_block(pkc, po); 758 + } 759 + } else { 760 + /* Case 1. Queue was frozen because user-space was 761 + * lagging behind. 762 + */ 763 + if (!prb_curr_blk_in_use(pbd)) { 764 + /* Case 2. queue was frozen,user-space caught up, 765 + * now the link went idle && the timer fired. 766 + * We don't have a block to close.So we open this 767 + * block and restart the timer. 768 + * opening a block thaws the queue,restarts timer 769 + * Thawing/timer-refresh is a side effect. 722 770 */ 723 - if (prb_curr_blk_in_use(pbd)) { 724 - /* 725 - * Ok, user-space is still behind. 
726 - * So just refresh the timer. 727 - */ 728 - goto refresh_timer; 729 - } else { 730 - /* Case 2. queue was frozen,user-space caught up, 731 - * now the link went idle && the timer fired. 732 - * We don't have a block to close.So we open this 733 - * block and restart the timer. 734 - * opening a block thaws the queue,restarts timer 735 - * Thawing/timer-refresh is a side effect. 736 - */ 737 - prb_open_block(pkc, pbd); 738 - goto out; 739 - } 771 + prb_open_block(pkc, pbd); 740 772 } 741 773 } 742 774 743 - refresh_timer: 744 - _prb_refresh_rx_retire_blk_timer(pkc); 745 - 746 - out: 775 + hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); 747 776 spin_unlock(&po->sk.sk_receive_queue.lock); 777 + return HRTIMER_RESTART; 748 778 } 749 779 750 780 static void prb_flush_block(struct tpacket_kbdq_core *pkc1, ··· 823 883 } 824 884 825 885 /* 826 - * Side effect of opening a block: 886 + * prb_open_block is called by tpacket_rcv or timer callback. 827 887 * 828 - * 1) prb_queue is thawed. 829 - * 2) retire_blk_timer is refreshed. 888 + * Reasons why NOT update hrtimer in prb_open_block: 889 + * 1) It will increase complexity to distinguish the two caller scenario. 890 + * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update 891 + * TMO of an already enqueued hrtimer, leading to complex shutdown logic. 830 892 * 893 + * One side effect of NOT update hrtimer when called by tpacket_rcv is that 894 + * a newly opened block triggered by tpacket_rcv may be retired earlier than 895 + * expected. On the other hand, if timeout is updated in prb_open_block, the 896 + * frequent reception of network packets that leads to prb_open_block being 897 + * called may cause hrtimer to be removed and enqueued repeatedly. 
831 898 */ 832 899 static void prb_open_block(struct tpacket_kbdq_core *pkc1, 833 900 struct tpacket_block_desc *pbd1) ··· 868 921 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; 869 922 870 923 prb_thaw_queue(pkc1); 871 - _prb_refresh_rx_retire_blk_timer(pkc1); 872 924 873 925 smp_wmb(); 874 926 }
+1 -1
net/packet/diag.c
··· 83 83 pdr.pdr_frame_nr = ring->frame_max + 1; 84 84 85 85 if (ver > TPACKET_V2) { 86 - pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; 86 + pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime); 87 87 pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; 88 88 pdr.pdr_features = ring->prb_bdqc.feature_req_word; 89 89 } else {
+4 -10
net/packet/internal.h
··· 20 20 unsigned int feature_req_word; 21 21 unsigned int hdrlen; 22 22 unsigned char reset_pending_on_curr_blk; 23 - unsigned char delete_blk_timer; 24 23 unsigned short kactive_blk_num; 25 24 unsigned short blk_sizeof_priv; 26 25 27 - /* last_kactive_blk_num: 28 - * trick to see if user-space has caught up 29 - * in order to avoid refreshing timer when every single pkt arrives. 30 - */ 31 - unsigned short last_kactive_blk_num; 26 + unsigned short version; 32 27 33 28 char *pkblk_start; 34 29 char *pkblk_end; ··· 33 38 uint64_t knxt_seq_num; 34 39 char *prev; 35 40 char *nxt_offset; 41 + 36 42 struct sk_buff *skb; 37 43 38 44 rwlock_t blk_fill_in_prog_lock; ··· 41 45 /* Default is set to 8ms */ 42 46 #define DEFAULT_PRB_RETIRE_TOV (8) 43 47 44 - unsigned short retire_blk_tov; 45 - unsigned short version; 46 - unsigned long tov_in_jiffies; 48 + ktime_t interval_ktime; 47 49 48 50 /* timer to retire an outstanding block */ 49 - struct timer_list retire_blk_timer; 51 + struct hrtimer retire_blk_timer; 50 52 }; 51 53 52 54 struct pgv {