Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/fair: Increase weight bits for avg_vruntime

Due to the zero_vruntime patch, the deltas are now a lot smaller and
measurements with kernel-build and hackbench runs show about 45 bits
used.

This ensures avg_vruntime() tracks the full weight range, reducing
numerical artifacts in reweight and the like.

Also, let's keep the paranoid debug code around for now.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
Link: https://patch.msgid.link/20260219080624.942813440%40infradead.org

+94 -21
+13 -1
kernel/sched/debug.c
··· 8 8 */ 9 9 #include <linux/debugfs.h> 10 10 #include <linux/nmi.h> 11 + #include <linux/log2.h> 11 12 #include "sched.h" 12 13 13 14 /* ··· 902 901 903 902 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 904 903 { 905 - s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; 904 + s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread; 905 + s64 zero_vruntime = -1, sum_w_vruntime = -1; 906 906 struct sched_entity *last, *first, *root; 907 907 struct rq *rq = cpu_rq(cpu); 908 + unsigned int sum_shift; 908 909 unsigned long flags; 910 + u64 sum_weight; 909 911 910 912 #ifdef CONFIG_FAIR_GROUP_SCHED 911 913 SEQ_printf(m, "\n"); ··· 929 925 if (last) 930 926 right_vruntime = last->vruntime; 931 927 zero_vruntime = cfs_rq->zero_vruntime; 928 + sum_w_vruntime = cfs_rq->sum_w_vruntime; 929 + sum_weight = cfs_rq->sum_weight; 930 + sum_shift = cfs_rq->sum_shift; 932 931 raw_spin_rq_unlock_irqrestore(rq, flags); 933 932 934 933 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", ··· 940 933 SPLIT_NS(left_vruntime)); 941 934 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", 942 935 SPLIT_NS(zero_vruntime)); 936 + SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime", 937 + sum_w_vruntime, ilog2(abs(sum_w_vruntime))); 938 + SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight", 939 + sum_weight); 940 + SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift); 943 941 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", 944 942 SPLIT_NS(avg_vruntime(cfs_rq))); 945 943 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+77 -19
kernel/sched/fair.c
··· 665 665 * Since zero_vruntime closely tracks the per-task service, these 666 666 * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag 667 667 * induced in the system due to quantisation. 668 - * 669 - * Also, we use scale_load_down() to reduce the size. 670 - * 671 - * As measured, the max (key * weight) value was ~44 bits for a kernel build. 672 668 */ 669 + static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w) 670 + { 671 + #ifdef CONFIG_64BIT 672 + if (cfs_rq->sum_shift) 673 + w = max(2UL, w >> cfs_rq->sum_shift); 674 + #endif 675 + return w; 676 + } 677 + 678 + static inline void 679 + __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 680 + { 681 + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); 682 + s64 w_vruntime, key = entity_key(cfs_rq, se); 683 + 684 + w_vruntime = key * weight; 685 + WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62)); 686 + 687 + cfs_rq->sum_w_vruntime += w_vruntime; 688 + cfs_rq->sum_weight += weight; 689 + } 690 + 691 + static void 692 + sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se) 693 + { 694 + unsigned long weight; 695 + s64 key, tmp; 696 + 697 + again: 698 + weight = avg_vruntime_weight(cfs_rq, se->load.weight); 699 + key = entity_key(cfs_rq, se); 700 + 701 + if (check_mul_overflow(key, weight, &key)) 702 + goto overflow; 703 + 704 + if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp)) 705 + goto overflow; 706 + 707 + cfs_rq->sum_w_vruntime = tmp; 708 + cfs_rq->sum_weight += weight; 709 + return; 710 + 711 + overflow: 712 + /* 713 + * There's gotta be a limit -- if we're still failing at this point 714 + * there's really nothing much to be done about things. 
715 + */ 716 + BUG_ON(cfs_rq->sum_shift >= 10); 717 + cfs_rq->sum_shift++; 718 + 719 + /* 720 + * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1 721 + */ 722 + cfs_rq->sum_w_vruntime = 0; 723 + cfs_rq->sum_weight = 0; 724 + 725 + for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost; 726 + node; node = rb_next(node)) 727 + __sum_w_vruntime_add(cfs_rq, __node_2_se(node)); 728 + 729 + goto again; 730 + } 731 + 673 732 static void 674 733 sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 675 734 { 676 - unsigned long weight = scale_load_down(se->load.weight); 677 - s64 key = entity_key(cfs_rq, se); 735 + if (sched_feat(PARANOID_AVG)) 736 + return sum_w_vruntime_add_paranoid(cfs_rq, se); 678 737 679 - cfs_rq->sum_w_vruntime += key * weight; 680 - cfs_rq->sum_weight += weight; 738 + __sum_w_vruntime_add(cfs_rq, se); 681 739 } 682 740 683 741 static void 684 742 sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) 685 743 { 686 - unsigned long weight = scale_load_down(se->load.weight); 744 + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); 687 745 s64 key = entity_key(cfs_rq, se); 688 746 689 747 cfs_rq->sum_w_vruntime -= key * weight; ··· 783 725 s64 runtime = cfs_rq->sum_w_vruntime; 784 726 785 727 if (curr) { 786 - unsigned long w = scale_load_down(curr->load.weight); 728 + unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight); 787 729 788 730 runtime += entity_key(cfs_rq, curr) * w; 789 731 weight += w; ··· 793 735 if (runtime < 0) 794 736 runtime -= (weight - 1); 795 737 796 - delta = div_s64(runtime, weight); 738 + delta = div64_long(runtime, weight); 797 739 } else if (curr) { 798 740 /* 799 741 * When there is but one element, it is the average. 
··· 859 801 long load = cfs_rq->sum_weight; 860 802 861 803 if (curr && curr->on_rq) { 862 - unsigned long weight = scale_load_down(curr->load.weight); 804 + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight); 863 805 864 806 avg += entity_key(cfs_rq, curr) * weight; 865 807 load += weight; ··· 3929 3871 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), 3930 3872 * we need to scale se->vlag when w_i changes. 3931 3873 */ 3932 - se->vlag = div_s64(se->vlag * se->load.weight, weight); 3874 + se->vlag = div64_long(se->vlag * se->load.weight, weight); 3933 3875 if (se->rel_deadline) 3934 - se->deadline = div_s64(se->deadline * se->load.weight, weight); 3876 + se->deadline = div64_long(se->deadline * se->load.weight, weight); 3935 3877 3936 3878 if (rel_vprot) 3937 - vprot = div_s64(vprot * se->load.weight, weight); 3879 + vprot = div64_long(vprot * se->load.weight, weight); 3938 3880 3939 3881 update_load_set(&se->load, weight); 3940 3882 ··· 5238 5180 */ 5239 5181 if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { 5240 5182 struct sched_entity *curr = cfs_rq->curr; 5241 - unsigned long load; 5183 + long load; 5242 5184 5243 5185 lag = se->vlag; 5244 5186 ··· 5296 5238 */ 5297 5239 load = cfs_rq->sum_weight; 5298 5240 if (curr && curr->on_rq) 5299 - load += scale_load_down(curr->load.weight); 5241 + load += avg_vruntime_weight(cfs_rq, curr->load.weight); 5300 5242 5301 - lag *= load + scale_load_down(se->load.weight); 5243 + lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight); 5302 5244 if (WARN_ON_ONCE(!load)) 5303 5245 load = 1; 5304 - lag = div_s64(lag, load); 5246 + lag = div64_long(lag, load); 5305 5247 } 5306 5248 5307 5249 se->vruntime = vruntime - lag;
+2
kernel/sched/features.h
··· 58 58 SCHED_FEAT(DELAY_DEQUEUE, true) 59 59 SCHED_FEAT(DELAY_ZERO, true) 60 60 61 + SCHED_FEAT(PARANOID_AVG, false) 62 + 61 63 /* 62 64 * Allow wakeup-time preemption of the current task: 63 65 */
+2 -1
kernel/sched/sched.h
··· 684 684 685 685 s64 sum_w_vruntime; 686 686 u64 sum_weight; 687 - 688 687 u64 zero_vruntime; 688 + unsigned int sum_shift; 689 + 689 690 #ifdef CONFIG_SCHED_CORE 690 691 unsigned int forceidle_seq; 691 692 u64 zero_vruntime_fi;