Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RAS updates from Ingo Molnar:
"Boris is on vacation so I'm sending the RAS bits this time. The main
changes were:

- Various RAS/CEC improvements and fixes by Borislav Petkov:
  - error insertion fixes
  - offlining latency fix
  - memory leak fix
  - additional sanity checks
  - cleanups
  - debug output improvements

- More SMCA enhancements by Yazen Ghannam:
  - make banks truly per-CPU which they are in the hardware
  - don't over-cache certain registers
  - make the number of MCA banks per-CPU variable

The long-term goal with these changes is to support future
heterogeneous SMCA extensions.

- Misc fixes and improvements"

* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Do not check return value of debugfs_create functions
x86/MCE: Determine MCA banks' init state properly
x86/MCE: Make the number of MCA banks a per-CPU variable
x86/MCE/AMD: Don't cache block addresses on SMCA systems
x86/MCE: Make mce_banks a per-CPU array
x86/MCE: Make struct mce_banks[] static
RAS/CEC: Add copyright
RAS/CEC: Add CONFIG_RAS_CEC_DEBUG and move CEC debug features there
RAS/CEC: Dump the different array element sections
RAS/CEC: Rename count_threshold to action_threshold
RAS/CEC: Sanity-check array on every insertion
RAS/CEC: Fix potential memory leak
RAS/CEC: Do not set decay value on error
RAS/CEC: Check count_threshold unconditionally
RAS/CEC: Fix pfn insertion

+270 -206
+47 -45
arch/x86/kernel/cpu/mce/amd.c
··· 99 99 [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, 100 100 }; 101 101 102 - static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = 103 - { 104 - [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } 105 - }; 106 - 107 102 static const char *smca_get_name(enum smca_bank_types t) 108 103 { 109 104 if (t >= N_SMCA_BANK_TYPES) ··· 192 197 static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); 193 198 static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ 194 199 200 + /* Map of banks that have more than MCA_MISC0 available. */ 201 + static DEFINE_PER_CPU(u32, smca_misc_banks_map); 202 + 195 203 static void amd_threshold_interrupt(void); 196 204 static void amd_deferred_error_interrupt(void); 197 205 ··· 203 205 pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); 204 206 } 205 207 void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; 208 + 209 + static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) 210 + { 211 + u32 low, high; 212 + 213 + /* 214 + * For SMCA enabled processors, BLKPTR field of the first MISC register 215 + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). 216 + */ 217 + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) 218 + return; 219 + 220 + if (!(low & MCI_CONFIG_MCAX)) 221 + return; 222 + 223 + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) 224 + return; 225 + 226 + if (low & MASK_BLKPTR_LO) 227 + per_cpu(smca_misc_banks_map, cpu) |= BIT(bank); 228 + 229 + } 206 230 207 231 static void smca_configure(unsigned int bank, unsigned int cpu) 208 232 { ··· 262 242 263 243 wrmsr(smca_config, low, high); 264 244 } 245 + 246 + smca_set_misc_banks_map(bank, cpu); 265 247 266 248 /* Return early if this bank was already initialized. 
*/ 267 249 if (smca_banks[bank].hwid) ··· 475 453 wrmsr(MSR_CU_DEF_ERR, low, high); 476 454 } 477 455 478 - static u32 smca_get_block_address(unsigned int bank, unsigned int block) 456 + static u32 smca_get_block_address(unsigned int bank, unsigned int block, 457 + unsigned int cpu) 479 458 { 480 - u32 low, high; 481 - u32 addr = 0; 482 - 483 - if (smca_get_bank_type(bank) == SMCA_RESERVED) 484 - return addr; 485 - 486 459 if (!block) 487 460 return MSR_AMD64_SMCA_MCx_MISC(bank); 488 461 489 - /* Check our cache first: */ 490 - if (smca_bank_addrs[bank][block] != -1) 491 - return smca_bank_addrs[bank][block]; 462 + if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank))) 463 + return 0; 492 464 493 - /* 494 - * For SMCA enabled processors, BLKPTR field of the first MISC register 495 - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). 496 - */ 497 - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) 498 - goto out; 499 - 500 - if (!(low & MCI_CONFIG_MCAX)) 501 - goto out; 502 - 503 - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && 504 - (low & MASK_BLKPTR_LO)) 505 - addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); 506 - 507 - out: 508 - smca_bank_addrs[bank][block] = addr; 509 - return addr; 465 + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); 510 466 } 511 467 512 468 static u32 get_block_address(u32 current_addr, u32 low, u32 high, 513 - unsigned int bank, unsigned int block) 469 + unsigned int bank, unsigned int block, 470 + unsigned int cpu) 514 471 { 515 472 u32 addr = 0, offset = 0; 516 473 517 - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) 474 + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) 518 475 return addr; 519 476 520 477 if (mce_flags.smca) 521 - return smca_get_block_address(bank, block); 478 + return smca_get_block_address(bank, block, cpu); 522 479 523 480 /* Fall back to method we used for older processors: */ 524 481 switch (block) { ··· 625 624 /* cpu init entry point, 
called from mce.c with preempt off */ 626 625 void mce_amd_feature_init(struct cpuinfo_x86 *c) 627 626 { 628 - u32 low = 0, high = 0, address = 0; 629 627 unsigned int bank, block, cpu = smp_processor_id(); 628 + u32 low = 0, high = 0, address = 0; 630 629 int offset = -1; 631 630 632 - for (bank = 0; bank < mca_cfg.banks; ++bank) { 631 + 632 + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { 633 633 if (mce_flags.smca) 634 634 smca_configure(bank, cpu); 635 635 636 636 disable_err_thresholding(c, bank); 637 637 638 638 for (block = 0; block < NR_BLOCKS; ++block) { 639 - address = get_block_address(address, low, high, bank, block); 639 + address = get_block_address(address, low, high, bank, block, cpu); 640 640 if (!address) 641 641 break; 642 642 ··· 975 973 { 976 974 unsigned int bank; 977 975 978 - for (bank = 0; bank < mca_cfg.banks; ++bank) 976 + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) 979 977 log_error_deferred(bank); 980 978 } 981 979 ··· 1016 1014 struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; 1017 1015 unsigned int bank, cpu = smp_processor_id(); 1018 1016 1019 - for (bank = 0; bank < mca_cfg.banks; ++bank) { 1017 + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { 1020 1018 if (!(per_cpu(bank_map, cpu) & (1 << bank))) 1021 1019 continue; 1022 1020 ··· 1203 1201 u32 low, high; 1204 1202 int err; 1205 1203 1206 - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) 1204 + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) 1207 1205 return 0; 1208 1206 1209 1207 if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) ··· 1254 1252 if (err) 1255 1253 goto out_free; 1256 1254 recurse: 1257 - address = get_block_address(address, low, high, bank, ++block); 1255 + address = get_block_address(address, low, high, bank, ++block, cpu); 1258 1256 if (!address) 1259 1257 return 0; 1260 1258 ··· 1437 1435 { 1438 1436 unsigned int bank; 1439 1437 1440 - for (bank = 0; bank < mca_cfg.banks; 
++bank) { 1438 + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { 1441 1439 if (!(per_cpu(bank_map, cpu) & (1 << bank))) 1442 1440 continue; 1443 1441 threshold_remove_bank(cpu, bank); ··· 1458 1456 if (bp) 1459 1457 return 0; 1460 1458 1461 - bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *), 1459 + bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *), 1462 1460 GFP_KERNEL); 1463 1461 if (!bp) 1464 1462 return -ENOMEM; 1465 1463 1466 1464 per_cpu(threshold_banks, cpu) = bp; 1467 1465 1468 - for (bank = 0; bank < mca_cfg.banks; ++bank) { 1466 + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { 1469 1467 if (!(per_cpu(bank_map, cpu) & (1 << bank))) 1470 1468 continue; 1471 1469 err = threshold_create_bank(cpu, bank);
+123 -56
arch/x86/kernel/cpu/mce/core.c
··· 65 65 66 66 DEFINE_PER_CPU(unsigned, mce_exception_count); 67 67 68 - struct mce_bank *mce_banks __read_mostly; 68 + DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); 69 + 70 + struct mce_bank { 71 + u64 ctl; /* subevents to enable */ 72 + bool init; /* initialise bank? */ 73 + }; 74 + static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); 75 + 76 + #define ATTR_LEN 16 77 + /* One object for each MCE bank, shared by all CPUs */ 78 + struct mce_bank_dev { 79 + struct device_attribute attr; /* device attribute */ 80 + char attrname[ATTR_LEN]; /* attribute name */ 81 + u8 bank; /* bank number */ 82 + }; 83 + static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS]; 84 + 69 85 struct mce_vendor_flags mce_flags __read_mostly; 70 86 71 87 struct mca_config mca_cfg __read_mostly = { ··· 691 675 */ 692 676 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) 693 677 { 678 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 694 679 bool error_seen = false; 695 680 struct mce m; 696 681 int i; ··· 703 686 if (flags & MCP_TIMESTAMP) 704 687 m.tsc = rdtsc(); 705 688 706 - for (i = 0; i < mca_cfg.banks; i++) { 689 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 707 690 if (!mce_banks[i].ctl || !test_bit(i, *b)) 708 691 continue; 709 692 ··· 805 788 char *tmp; 806 789 int i; 807 790 808 - for (i = 0; i < mca_cfg.banks; i++) { 791 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 809 792 m->status = mce_rdmsrl(msr_ops.status(i)); 810 793 if (!(m->status & MCI_STATUS_VAL)) 811 794 continue; ··· 1085 1068 { 1086 1069 int i; 1087 1070 1088 - for (i = 0; i < mca_cfg.banks; i++) { 1071 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 1089 1072 if (test_bit(i, toclear)) 1090 1073 mce_wrmsrl(msr_ops.status(i), 0); 1091 1074 } ··· 1139 1122 unsigned long *toclear, unsigned long *valid_banks, 1140 1123 int no_way_out, int *worst) 1141 1124 { 1125 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 1142 
1126 struct mca_config *cfg = &mca_cfg; 1143 1127 int severity, i; 1144 1128 1145 - for (i = 0; i < cfg->banks; i++) { 1129 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 1146 1130 __clear_bit(i, toclear); 1147 1131 if (!test_bit(i, valid_banks)) 1148 1132 continue; ··· 1481 1463 } 1482 1464 EXPORT_SYMBOL_GPL(mce_notify_irq); 1483 1465 1484 - static int __mcheck_cpu_mce_banks_init(void) 1466 + static void __mcheck_cpu_mce_banks_init(void) 1485 1467 { 1468 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 1469 + u8 n_banks = this_cpu_read(mce_num_banks); 1486 1470 int i; 1487 1471 1488 - mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); 1489 - if (!mce_banks) 1490 - return -ENOMEM; 1491 - 1492 - for (i = 0; i < MAX_NR_BANKS; i++) { 1472 + for (i = 0; i < n_banks; i++) { 1493 1473 struct mce_bank *b = &mce_banks[i]; 1494 1474 1475 + /* 1476 + * Init them all, __mcheck_cpu_apply_quirks() is going to apply 1477 + * the required vendor quirks before 1478 + * __mcheck_cpu_init_clear_banks() does the final bank setup. 1479 + */ 1495 1480 b->ctl = -1ULL; 1496 1481 b->init = 1; 1497 1482 } 1498 - return 0; 1499 1483 } 1500 1484 1501 1485 /* 1502 1486 * Initialize Machine Checks for a CPU. 
1503 1487 */ 1504 - static int __mcheck_cpu_cap_init(void) 1488 + static void __mcheck_cpu_cap_init(void) 1505 1489 { 1506 1490 u64 cap; 1507 1491 u8 b; ··· 1511 1491 rdmsrl(MSR_IA32_MCG_CAP, cap); 1512 1492 1513 1493 b = cap & MCG_BANKCNT_MASK; 1514 - if (WARN_ON_ONCE(b > MAX_NR_BANKS)) 1494 + 1495 + if (b > MAX_NR_BANKS) { 1496 + pr_warn("CPU%d: Using only %u machine check banks out of %u\n", 1497 + smp_processor_id(), MAX_NR_BANKS, b); 1515 1498 b = MAX_NR_BANKS; 1516 - 1517 - mca_cfg.banks = max(mca_cfg.banks, b); 1518 - 1519 - if (!mce_banks) { 1520 - int err = __mcheck_cpu_mce_banks_init(); 1521 - if (err) 1522 - return err; 1523 1499 } 1500 + 1501 + this_cpu_write(mce_num_banks, b); 1502 + 1503 + __mcheck_cpu_mce_banks_init(); 1524 1504 1525 1505 /* Use accurate RIP reporting if available. */ 1526 1506 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) ··· 1528 1508 1529 1509 if (cap & MCG_SER_P) 1530 1510 mca_cfg.ser = 1; 1531 - 1532 - return 0; 1533 1511 } 1534 1512 1535 1513 static void __mcheck_cpu_init_generic(void) ··· 1554 1536 1555 1537 static void __mcheck_cpu_init_clear_banks(void) 1556 1538 { 1539 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 1557 1540 int i; 1558 1541 1559 - for (i = 0; i < mca_cfg.banks; i++) { 1542 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 1560 1543 struct mce_bank *b = &mce_banks[i]; 1561 1544 1562 1545 if (!b->init) 1563 1546 continue; 1564 1547 wrmsrl(msr_ops.ctl(i), b->ctl); 1565 1548 wrmsrl(msr_ops.status(i), 0); 1549 + } 1550 + } 1551 + 1552 + /* 1553 + * Do a final check to see if there are any unused/RAZ banks. 1554 + * 1555 + * This must be done after the banks have been initialized and any quirks have 1556 + * been applied. 1557 + * 1558 + * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. 1559 + * Otherwise, a user who disables a bank will not be able to re-enable it 1560 + * without a system reboot. 
1561 + */ 1562 + static void __mcheck_cpu_check_banks(void) 1563 + { 1564 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 1565 + u64 msrval; 1566 + int i; 1567 + 1568 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 1569 + struct mce_bank *b = &mce_banks[i]; 1570 + 1571 + if (!b->init) 1572 + continue; 1573 + 1574 + rdmsrl(msr_ops.ctl(i), msrval); 1575 + b->init = !!msrval; 1566 1576 } 1567 1577 } 1568 1578 ··· 1625 1579 /* Add per CPU specific workarounds here */ 1626 1580 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1627 1581 { 1582 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 1628 1583 struct mca_config *cfg = &mca_cfg; 1629 1584 1630 1585 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { ··· 1635 1588 1636 1589 /* This should be disabled by the BIOS, but isn't always */ 1637 1590 if (c->x86_vendor == X86_VENDOR_AMD) { 1638 - if (c->x86 == 15 && cfg->banks > 4) { 1591 + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { 1639 1592 /* 1640 1593 * disable GART TBL walk error reporting, which 1641 1594 * trips off incorrectly with the IOMMU & 3ware ··· 1654 1607 * Various K7s with broken bank 0 around. Always disable 1655 1608 * by default. 1656 1609 */ 1657 - if (c->x86 == 6 && cfg->banks > 0) 1610 + if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) 1658 1611 mce_banks[0].ctl = 0; 1659 1612 1660 1613 /* ··· 1676 1629 * valid event later, merely don't write CTL0. 
1677 1630 */ 1678 1631 1679 - if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) 1632 + if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) 1680 1633 mce_banks[0].init = 0; 1681 1634 1682 1635 /* ··· 1862 1815 if (!mce_available(c)) 1863 1816 return; 1864 1817 1865 - if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1818 + __mcheck_cpu_cap_init(); 1819 + 1820 + if (__mcheck_cpu_apply_quirks(c) < 0) { 1866 1821 mca_cfg.disabled = 1; 1867 1822 return; 1868 1823 } ··· 1881 1832 __mcheck_cpu_init_generic(); 1882 1833 __mcheck_cpu_init_vendor(c); 1883 1834 __mcheck_cpu_init_clear_banks(); 1835 + __mcheck_cpu_check_banks(); 1884 1836 __mcheck_cpu_setup_timer(); 1885 1837 } 1886 1838 ··· 1913 1863 1914 1864 void mce_disable_bank(int bank) 1915 1865 { 1916 - if (bank >= mca_cfg.banks) { 1866 + if (bank >= this_cpu_read(mce_num_banks)) { 1917 1867 pr_warn(FW_BUG 1918 1868 "Ignoring request to disable invalid MCA bank %d.\n", 1919 1869 bank); ··· 1999 1949 */ 2000 1950 static void mce_disable_error_reporting(void) 2001 1951 { 1952 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 2002 1953 int i; 2003 1954 2004 - for (i = 0; i < mca_cfg.banks; i++) { 1955 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 2005 1956 struct mce_bank *b = &mce_banks[i]; 2006 1957 2007 1958 if (b->init) ··· 2102 2051 2103 2052 DEFINE_PER_CPU(struct device *, mce_device); 2104 2053 2105 - static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 2054 + static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr) 2106 2055 { 2107 - return container_of(attr, struct mce_bank, attr); 2056 + return container_of(attr, struct mce_bank_dev, attr); 2108 2057 } 2109 2058 2110 2059 static ssize_t show_bank(struct device *s, struct device_attribute *attr, 2111 2060 char *buf) 2112 2061 { 2113 - return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 2062 + u8 bank = attr_to_bank(attr)->bank; 2063 + struct 
mce_bank *b; 2064 + 2065 + if (bank >= per_cpu(mce_num_banks, s->id)) 2066 + return -EINVAL; 2067 + 2068 + b = &per_cpu(mce_banks_array, s->id)[bank]; 2069 + 2070 + if (!b->init) 2071 + return -ENODEV; 2072 + 2073 + return sprintf(buf, "%llx\n", b->ctl); 2114 2074 } 2115 2075 2116 2076 static ssize_t set_bank(struct device *s, struct device_attribute *attr, 2117 2077 const char *buf, size_t size) 2118 2078 { 2079 + u8 bank = attr_to_bank(attr)->bank; 2080 + struct mce_bank *b; 2119 2081 u64 new; 2120 2082 2121 2083 if (kstrtou64(buf, 0, &new) < 0) 2122 2084 return -EINVAL; 2123 2085 2124 - attr_to_bank(attr)->ctl = new; 2086 + if (bank >= per_cpu(mce_num_banks, s->id)) 2087 + return -EINVAL; 2088 + 2089 + b = &per_cpu(mce_banks_array, s->id)[bank]; 2090 + 2091 + if (!b->init) 2092 + return -ENODEV; 2093 + 2094 + b->ctl = new; 2125 2095 mce_restart(); 2126 2096 2127 2097 return size; ··· 2257 2185 kfree(dev); 2258 2186 } 2259 2187 2260 - /* Per cpu device init. All of the cpus still share the same ctrl bank: */ 2188 + /* Per CPU device init. 
All of the CPUs still share the same bank device: */ 2261 2189 static int mce_device_create(unsigned int cpu) 2262 2190 { 2263 2191 struct device *dev; ··· 2289 2217 if (err) 2290 2218 goto error; 2291 2219 } 2292 - for (j = 0; j < mca_cfg.banks; j++) { 2293 - err = device_create_file(dev, &mce_banks[j].attr); 2220 + for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) { 2221 + err = device_create_file(dev, &mce_bank_devs[j].attr); 2294 2222 if (err) 2295 2223 goto error2; 2296 2224 } ··· 2300 2228 return 0; 2301 2229 error2: 2302 2230 while (--j >= 0) 2303 - device_remove_file(dev, &mce_banks[j].attr); 2231 + device_remove_file(dev, &mce_bank_devs[j].attr); 2304 2232 error: 2305 2233 while (--i >= 0) 2306 2234 device_remove_file(dev, mce_device_attrs[i]); ··· 2321 2249 for (i = 0; mce_device_attrs[i]; i++) 2322 2250 device_remove_file(dev, mce_device_attrs[i]); 2323 2251 2324 - for (i = 0; i < mca_cfg.banks; i++) 2325 - device_remove_file(dev, &mce_banks[i].attr); 2252 + for (i = 0; i < per_cpu(mce_num_banks, cpu); i++) 2253 + device_remove_file(dev, &mce_bank_devs[i].attr); 2326 2254 2327 2255 device_unregister(dev); 2328 2256 cpumask_clear_cpu(cpu, mce_device_initialized); ··· 2343 2271 2344 2272 static void mce_reenable_cpu(void) 2345 2273 { 2274 + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); 2346 2275 int i; 2347 2276 2348 2277 if (!mce_available(raw_cpu_ptr(&cpu_info))) ··· 2351 2278 2352 2279 if (!cpuhp_tasks_frozen) 2353 2280 cmci_reenable(); 2354 - for (i = 0; i < mca_cfg.banks; i++) { 2281 + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { 2355 2282 struct mce_bank *b = &mce_banks[i]; 2356 2283 2357 2284 if (b->init) ··· 2401 2328 { 2402 2329 int i; 2403 2330 2404 - for (i = 0; i < mca_cfg.banks; i++) { 2405 - struct mce_bank *b = &mce_banks[i]; 2331 + for (i = 0; i < MAX_NR_BANKS; i++) { 2332 + struct mce_bank_dev *b = &mce_bank_devs[i]; 2406 2333 struct device_attribute *a = &b->attr; 2334 + 2335 + b->bank = i; 2407 2336 2408 2337 
sysfs_attr_init(&a->attr); 2409 2338 a->attr.name = b->attrname; ··· 2516 2441 DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, 2517 2442 "%llu\n"); 2518 2443 2519 - static int __init mcheck_debugfs_init(void) 2444 + static void __init mcheck_debugfs_init(void) 2520 2445 { 2521 - struct dentry *dmce, *ffake_panic; 2446 + struct dentry *dmce; 2522 2447 2523 2448 dmce = mce_get_debugfs_dir(); 2524 - if (!dmce) 2525 - return -ENOMEM; 2526 - ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce, 2527 - NULL, &fake_panic_fops); 2528 - if (!ffake_panic) 2529 - return -ENOMEM; 2530 - 2531 - return 0; 2449 + debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL, 2450 + &fake_panic_fops); 2532 2451 } 2533 2452 #else 2534 - static int __init mcheck_debugfs_init(void) { return -EINVAL; } 2453 + static void __init mcheck_debugfs_init(void) { } 2535 2454 #endif 2536 2455 2537 2456 DEFINE_STATIC_KEY_FALSE(mcsafe_key); ··· 2533 2464 2534 2465 static int __init mcheck_late_init(void) 2535 2466 { 2536 - pr_info("Using %d MCE banks\n", mca_cfg.banks); 2537 - 2538 2467 if (mca_cfg.recovery) 2539 2468 static_branch_inc(&mcsafe_key); 2540 2469
+5 -32
arch/x86/kernel/cpu/mce/inject.c
··· 645 645 646 646 static struct dfs_node { 647 647 char *name; 648 - struct dentry *d; 649 648 const struct file_operations *fops; 650 649 umode_t perm; 651 650 } dfs_fls[] = { ··· 658 659 { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, 659 660 }; 660 661 661 - static int __init debugfs_init(void) 662 + static void __init debugfs_init(void) 662 663 { 663 664 unsigned int i; 664 665 665 666 dfs_inj = debugfs_create_dir("mce-inject", NULL); 666 - if (!dfs_inj) 667 - return -EINVAL; 668 667 669 - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { 670 - dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, 671 - dfs_fls[i].perm, 672 - dfs_inj, 673 - &i_mce, 674 - dfs_fls[i].fops); 675 - 676 - if (!dfs_fls[i].d) 677 - goto err_dfs_add; 678 - } 679 - 680 - return 0; 681 - 682 - err_dfs_add: 683 - while (i-- > 0) 684 - debugfs_remove(dfs_fls[i].d); 685 - 686 - debugfs_remove(dfs_inj); 687 - dfs_inj = NULL; 688 - 689 - return -ENODEV; 668 + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) 669 + debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj, 670 + &i_mce, dfs_fls[i].fops); 690 671 } 691 672 692 673 static int __init inject_init(void) 693 674 { 694 - int err; 695 - 696 675 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) 697 676 return -ENOMEM; 698 677 699 - err = debugfs_init(); 700 - if (err) { 701 - free_cpumask_var(mce_inject_cpumask); 702 - return err; 703 - } 678 + debugfs_init(); 704 679 705 680 register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); 706 681 mce_register_injector_chain(&inject_nb);
+1 -11
arch/x86/kernel/cpu/mce/internal.h
··· 22 22 23 23 extern struct blocking_notifier_head x86_mce_decoder_chain; 24 24 25 - #define ATTR_LEN 16 26 25 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ 27 - 28 - /* One object for each MCE bank, shared by all CPUs */ 29 - struct mce_bank { 30 - u64 ctl; /* subevents to enable */ 31 - unsigned char init; /* initialise bank? */ 32 - struct device_attribute attr; /* device attribute */ 33 - char attrname[ATTR_LEN]; /* attribute name */ 34 - }; 35 26 36 27 struct mce_evt_llist { 37 28 struct llist_node llnode; ··· 38 47 extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); 39 48 struct dentry *mce_get_debugfs_dir(void); 40 49 41 - extern struct mce_bank *mce_banks; 42 50 extern mce_banks_t mce_banks_ce_disabled; 43 51 44 52 #ifdef CONFIG_X86_MCE_INTEL ··· 118 128 bios_cmci_threshold : 1, 119 129 __reserved : 59; 120 130 121 - u8 banks; 122 131 s8 bootlog; 123 132 int tolerant; 124 133 int monarch_timeout; ··· 126 137 }; 127 138 128 139 extern struct mca_config mca_cfg; 140 + DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); 129 141 130 142 struct mce_vendor_flags { 131 143 /*
+3 -11
arch/x86/kernel/cpu/mce/severity.c
··· 400 400 401 401 static int __init severities_debugfs_init(void) 402 402 { 403 - struct dentry *dmce, *fsev; 403 + struct dentry *dmce; 404 404 405 405 dmce = mce_get_debugfs_dir(); 406 - if (!dmce) 407 - goto err_out; 408 406 409 - fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, 410 - &severities_coverage_fops); 411 - if (!fsev) 412 - goto err_out; 413 - 407 + debugfs_create_file("severities-coverage", 0444, dmce, NULL, 408 + &severities_coverage_fops); 414 409 return 0; 415 - 416 - err_out: 417 - return -ENOMEM; 418 410 } 419 411 late_initcall(severities_debugfs_init); 420 412 #endif /* CONFIG_DEBUG_FS */
+10
arch/x86/ras/Kconfig
··· 11 11 12 12 Bear in mind that this is absolutely useless if your platform doesn't 13 13 have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. 14 + 15 + config RAS_CEC_DEBUG 16 + bool "CEC debugging machinery" 17 + default n 18 + depends on RAS_CEC 19 + help 20 + Add extra files to (debugfs)/ras/cec to test the correctable error 21 + collector feature. "pfn" is a writable file that allows user to 22 + simulate an error in a particular page frame. "array" is a read-only 23 + file that dumps out the current state of all pages logged so far.
+81 -51
drivers/ras/cec.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (c) 2017-2019 Borislav Petkov, SUSE Labs. 4 + */ 2 5 #include <linux/mm.h> 3 6 #include <linux/gfp.h> 4 7 #include <linux/kernel.h> ··· 40 37 * thus emulate an an LRU-like behavior when deleting elements to free up space 41 38 * in the page. 42 39 * 43 - * When an element reaches it's max count of count_threshold, we try to poison 44 - * it by assuming that errors triggered count_threshold times in a single page 45 - * are excessive and that page shouldn't be used anymore. count_threshold is 40 + * When an element reaches it's max count of action_threshold, we try to poison 41 + * it by assuming that errors triggered action_threshold times in a single page 42 + * are excessive and that page shouldn't be used anymore. action_threshold is 46 43 * initialized to COUNT_MASK which is the maximum. 47 44 * 48 45 * That error event entry causes cec_add_elem() to return !0 value and thus ··· 125 122 static u64 dfs_pfn; 126 123 127 124 /* Amount of errors after which we offline */ 128 - static unsigned int count_threshold = COUNT_MASK; 125 + static u64 action_threshold = COUNT_MASK; 129 126 130 127 /* Each element "decays" each decay_interval which is 24hrs by default. 
*/ 131 128 #define CEC_DECAY_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */ ··· 279 276 return pfn; 280 277 } 281 278 279 + static bool sanity_check(struct ce_array *ca) 280 + { 281 + bool ret = false; 282 + u64 prev = 0; 283 + int i; 284 + 285 + for (i = 0; i < ca->n; i++) { 286 + u64 this = PFN(ca->array[i]); 287 + 288 + if (WARN(prev > this, "prev: 0x%016llx <-> this: 0x%016llx\n", prev, this)) 289 + ret = true; 290 + 291 + prev = this; 292 + } 293 + 294 + if (!ret) 295 + return ret; 296 + 297 + pr_info("Sanity check dump:\n{ n: %d\n", ca->n); 298 + for (i = 0; i < ca->n; i++) { 299 + u64 this = PFN(ca->array[i]); 300 + 301 + pr_info(" %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i])); 302 + } 303 + pr_info("}\n"); 304 + 305 + return ret; 306 + } 282 307 283 308 int cec_add_elem(u64 pfn) 284 309 { 285 310 struct ce_array *ca = &ce_arr; 286 - unsigned int to; 311 + unsigned int to = 0; 287 312 int count, ret = 0; 288 313 289 314 /* ··· 325 294 326 295 ca->ces_entered++; 327 296 297 + /* Array full, free the LRU slot. */ 328 298 if (ca->n == MAX_ELEMS) 329 299 WARN_ON(!del_lru_elem_unlocked(ca)); 330 300 ··· 338 306 (void *)&ca->array[to], 339 307 (ca->n - to) * sizeof(u64)); 340 308 341 - ca->array[to] = (pfn << PAGE_SHIFT) | 342 - (DECAY_MASK << COUNT_BITS) | 1; 343 - 309 + ca->array[to] = pfn << PAGE_SHIFT; 344 310 ca->n++; 345 - 346 - ret = 0; 347 - 348 - goto decay; 349 311 } 350 312 313 + /* Add/refresh element generation and increment count */ 314 + ca->array[to] |= DECAY_MASK << COUNT_BITS; 315 + ca->array[to]++; 316 + 317 + /* Check action threshold and soft-offline, if reached. 
*/ 351 318 count = COUNT(ca->array[to]); 352 - 353 - if (count < count_threshold) { 354 - ca->array[to] |= (DECAY_MASK << COUNT_BITS); 355 - ca->array[to]++; 356 - 357 - ret = 0; 358 - } else { 319 + if (count >= action_threshold) { 359 320 u64 pfn = ca->array[to] >> PAGE_SHIFT; 360 321 361 322 if (!pfn_valid(pfn)) { ··· 363 338 del_elem(ca, to); 364 339 365 340 /* 366 - * Return a >0 value to denote that we've reached the offlining 367 - * threshold. 341 + * Return a >0 value to callers, to denote that we've reached 342 + * the offlining threshold. 368 343 */ 369 344 ret = 1; 370 345 371 346 goto unlock; 372 347 } 373 348 374 - decay: 375 349 ca->decay_count++; 376 350 377 351 if (ca->decay_count >= CLEAN_ELEMS) 378 352 do_spring_cleaning(ca); 353 + 354 + WARN_ON_ONCE(sanity_check(ca)); 379 355 380 356 unlock: 381 357 mutex_unlock(&ce_mutex); ··· 395 369 { 396 370 *(u64 *)data = val; 397 371 398 - return cec_add_elem(val); 372 + cec_add_elem(val); 373 + 374 + return 0; 399 375 } 400 376 401 377 DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n"); 402 378 403 379 static int decay_interval_set(void *data, u64 val) 404 380 { 405 - *(u64 *)data = val; 406 - 407 381 if (val < CEC_DECAY_MIN_INTERVAL) 408 382 return -EINVAL; 409 383 410 384 if (val > CEC_DECAY_MAX_INTERVAL) 411 385 return -EINVAL; 412 386 387 + *(u64 *)data = val; 413 388 decay_interval = val; 414 389 415 390 cec_mod_work(decay_interval); 391 + 416 392 return 0; 417 393 } 418 394 DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n"); 419 395 420 - static int count_threshold_set(void *data, u64 val) 396 + static int action_threshold_set(void *data, u64 val) 421 397 { 422 398 *(u64 *)data = val; 423 399 424 400 if (val > COUNT_MASK) 425 401 val = COUNT_MASK; 426 402 427 - count_threshold = val; 403 + action_threshold = val; 428 404 429 405 return 0; 430 406 } 431 - DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n"); 407 + 
DEFINE_DEBUGFS_ATTRIBUTE(action_threshold_ops, u64_get, action_threshold_set, "%lld\n"); 408 + 409 + static const char * const bins[] = { "00", "01", "10", "11" }; 432 410 433 411 static int array_dump(struct seq_file *m, void *v) 434 412 { 435 413 struct ce_array *ca = &ce_arr; 436 - u64 prev = 0; 437 414 int i; 438 415 439 416 mutex_lock(&ce_mutex); ··· 445 416 for (i = 0; i < ca->n; i++) { 446 417 u64 this = PFN(ca->array[i]); 447 418 448 - seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i])); 449 - 450 - WARN_ON(prev > this); 451 - 452 - prev = this; 419 + seq_printf(m, " %3d: [%016llx|%s|%03llx]\n", 420 + i, this, bins[DECAY(ca->array[i])], COUNT(ca->array[i])); 453 421 } 454 422 455 423 seq_printf(m, "}\n"); ··· 459 433 seq_printf(m, "Decay interval: %lld seconds\n", decay_interval); 460 434 seq_printf(m, "Decays: %lld\n", ca->decays_done); 461 435 462 - seq_printf(m, "Action threshold: %d\n", count_threshold); 436 + seq_printf(m, "Action threshold: %lld\n", action_threshold); 463 437 464 438 mutex_unlock(&ce_mutex); 465 439 ··· 489 463 return -1; 490 464 } 491 465 466 + decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d, 467 + &decay_interval, &decay_interval_ops); 468 + if (!decay) { 469 + pr_warn("Error creating decay_interval debugfs node!\n"); 470 + goto err; 471 + } 472 + 473 + count = debugfs_create_file("action_threshold", S_IRUSR | S_IWUSR, d, 474 + &action_threshold, &action_threshold_ops); 475 + if (!count) { 476 + pr_warn("Error creating action_threshold debugfs node!\n"); 477 + goto err; 478 + } 479 + 480 + if (!IS_ENABLED(CONFIG_RAS_CEC_DEBUG)) 481 + return 0; 482 + 492 483 pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops); 493 484 if (!pfn) { 494 485 pr_warn("Error creating pfn debugfs node!\n"); ··· 517 474 pr_warn("Error creating array debugfs node!\n"); 518 475 goto err; 519 476 } 520 - 521 - decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d, 522 - 
&decay_interval, &decay_interval_ops); 523 - if (!decay) { 524 - pr_warn("Error creating decay_interval debugfs node!\n"); 525 - goto err; 526 - } 527 - 528 - count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d, 529 - &count_threshold, &count_threshold_ops); 530 - if (!count) { 531 - pr_warn("Error creating count_threshold debugfs node!\n"); 532 - goto err; 533 - } 534 - 535 477 536 478 return 0; 537 479 ··· 537 509 return; 538 510 } 539 511 540 - if (create_debugfs_nodes()) 512 + if (create_debugfs_nodes()) { 513 + free_page((unsigned long)ce_arr.array); 541 514 return; 515 + } 542 516 543 517 INIT_DELAYED_WORK(&cec_work, cec_work_fn); 544 518 schedule_delayed_work(&cec_work, CEC_DECAY_DEFAULT_INTERVAL);