Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ras_urgent_for_v6.16_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS fixes from Borislav Petkov:

- Do not remove the MCE sysfs hierarchy if initialization of the
thresholding sysfs nodes fails because new/unknown banks are present,
which is not fatal in itself; also add default names for new banks

- Make sure MCE polling settings are honored after CMCI storms

- Make sure MCE threshold limit is reset after the thresholding
interrupt has been serviced

- Clean up properly and disable CMCI banks on shutdown so that a
second/kexec-ed kernel can rediscover those banks

* tag 'ras_urgent_for_v6.16_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Make sure CMCI banks are cleared during shutdown on Intel
x86/mce/amd: Fix threshold limit reset
x86/mce/amd: Add default names for MCA banks and blocks
x86/mce: Ensure user polling settings are honored when restarting timer
x86/mce: Don't remove sysfs if thresholding sysfs init fails

+29 -24
+17 -11
arch/x86/kernel/cpu/mce/amd.c
··· 350 350 351 351 struct thresh_restart { 352 352 struct threshold_block *b; 353 - int reset; 354 353 int set_lvt_off; 355 354 int lvt_off; 356 355 u16 old_limit; ··· 431 432 432 433 rdmsr(tr->b->address, lo, hi); 433 434 434 - if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) 435 - tr->reset = 1; /* limit cannot be lower than err count */ 436 - 437 - if (tr->reset) { /* reset err count and overflow bit */ 438 - hi = 439 - (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 440 - (THRESHOLD_MAX - tr->b->threshold_limit); 435 + /* 436 + * Reset error count and overflow bit. 437 + * This is done during init or after handling an interrupt. 438 + */ 439 + if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) { 440 + hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI); 441 + hi |= THRESHOLD_MAX - tr->b->threshold_limit; 441 442 } else if (tr->old_limit) { /* change limit w/o reset */ 442 443 int new_count = (hi & THRESHOLD_MAX) + 443 444 (tr->old_limit - tr->b->threshold_limit); ··· 1112 1113 } 1113 1114 1114 1115 bank_type = smca_get_bank_type(cpu, bank); 1115 - if (bank_type >= N_SMCA_BANK_TYPES) 1116 - return NULL; 1117 1116 1118 1117 if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) { 1119 1118 if (b->block < ARRAY_SIZE(smca_umc_block_names)) 1120 1119 return smca_umc_block_names[b->block]; 1121 - return NULL; 1120 + } 1121 + 1122 + if (b && b->block) { 1123 + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block); 1124 + return buf_mcatype; 1125 + } 1126 + 1127 + if (bank_type >= N_SMCA_BANK_TYPES) { 1128 + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank); 1129 + return buf_mcatype; 1122 1130 } 1123 1131 1124 1132 if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
+11 -13
arch/x86/kernel/cpu/mce/core.c
··· 1740 1740 1741 1741 void (*mc_poll_banks)(void) = mc_poll_banks_default; 1742 1742 1743 + static bool should_enable_timer(unsigned long iv) 1744 + { 1745 + return !mca_cfg.ignore_ce && iv; 1746 + } 1747 + 1743 1748 static void mce_timer_fn(struct timer_list *t) 1744 1749 { 1745 1750 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); ··· 1768 1763 1769 1764 if (mce_get_storm_mode()) { 1770 1765 __start_timer(t, HZ); 1771 - } else { 1766 + } else if (should_enable_timer(iv)) { 1772 1767 __this_cpu_write(mce_next_interval, iv); 1773 1768 __start_timer(t, iv); 1774 1769 } ··· 2161 2156 { 2162 2157 unsigned long iv = check_interval * HZ; 2163 2158 2164 - if (mca_cfg.ignore_ce || !iv) 2165 - return; 2166 - 2167 - this_cpu_write(mce_next_interval, iv); 2168 - __start_timer(t, iv); 2159 + if (should_enable_timer(iv)) { 2160 + this_cpu_write(mce_next_interval, iv); 2161 + __start_timer(t, iv); 2162 + } 2169 2163 } 2170 2164 2171 2165 static void __mcheck_cpu_setup_timer(void) ··· 2805 2801 static int mce_cpu_online(unsigned int cpu) 2806 2802 { 2807 2803 struct timer_list *t = this_cpu_ptr(&mce_timer); 2808 - int ret; 2809 2804 2810 2805 mce_device_create(cpu); 2811 - 2812 - ret = mce_threshold_create_device(cpu); 2813 - if (ret) { 2814 - mce_device_remove(cpu); 2815 - return ret; 2816 - } 2806 + mce_threshold_create_device(cpu); 2817 2807 mce_reenable_cpu(); 2818 2808 mce_start_timer(t); 2819 2809 return 0;
+1
arch/x86/kernel/cpu/mce/intel.c
··· 478 478 void mce_intel_feature_clear(struct cpuinfo_x86 *c) 479 479 { 480 480 intel_clear_lmce(); 481 + cmci_clear(); 481 482 } 482 483 483 484 bool intel_filter_mce(struct mce *m)