Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

powerpc/pseries: Defer the logging of rtas error to irq work queue.

rtas_log_buf is a buffer to hold RTAS event data that are communicated
to kernel by hypervisor. This buffer is then used to pass RTAS event
data to user through proc fs. This buffer is allocated from
vmalloc (non-linear mapping) area.

On Machine check interrupt, register r3 points to RTAS extended event
log passed by hypervisor that contains the MCE event. The pseries
machine check handler then logs this error into rtas_log_buf. The
rtas_log_buf is a vmalloc-ed (non-linear) buffer, so we end up taking a
page fault (vector 0x300) while accessing it. Since the machine check
interrupt handler runs in NMI context, we cannot afford to take any
page fault. Page faults are not honored in NMI context and cause a
kernel panic. Apart from that, as Nick pointed out,
pSeries_log_error() also takes a spin_lock while logging error which
is not safe in NMI context. It may end up in a deadlock if we get another
MCE before releasing the lock. Fix this by deferring the logging of
rtas error to irq work queue.

Current implementation uses two different buffers to hold rtas error
log depending on whether extended log is provided or not. This makes
it a bit difficult to identify which buffer has valid data that needs to
be logged later in irq work. Simplify this by using a single buffer, one per
paca, and copy rtas log to it irrespective of whether extended log is
provided or not. Allocate this buffer below RMA region so that it can
be accessed in real mode mce handler.

Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt")
Cc: stable@vger.kernel.org # v4.14+
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Mahesh Salgaonkar and committed by
Michael Ellerman
94675cce 74e96bf4

+51 -15
+3
arch/powerpc/include/asm/paca.h
··· 247 247 void *rfi_flush_fallback_area; 248 248 u64 l1d_flush_size; 249 249 #endif 250 + #ifdef CONFIG_PPC_PSERIES 251 + u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */ 252 + #endif /* CONFIG_PPC_PSERIES */ 250 253 } ____cacheline_aligned; 251 254 252 255 extern void copy_mm_to_paca(struct mm_struct *mm);
+32 -15
arch/powerpc/platforms/pseries/ras.c
··· 22 22 #include <linux/of.h> 23 23 #include <linux/fs.h> 24 24 #include <linux/reboot.h> 25 + #include <linux/irq_work.h> 25 26 26 27 #include <asm/machdep.h> 27 28 #include <asm/rtas.h> ··· 33 32 static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; 34 33 static DEFINE_SPINLOCK(ras_log_buf_lock); 35 34 36 - static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; 37 - static DEFINE_PER_CPU(__u64, mce_data_buf); 38 - 39 35 static int ras_check_exception_token; 36 + 37 + static void mce_process_errlog_event(struct irq_work *work); 38 + static struct irq_work mce_errlog_process_work = { 39 + .func = mce_process_errlog_event, 40 + }; 40 41 41 42 #define EPOW_SENSOR_TOKEN 9 42 43 #define EPOW_SENSOR_INDEX 0 ··· 333 330 ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ 334 331 (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) 335 332 333 + static inline struct rtas_error_log *fwnmi_get_errlog(void) 334 + { 335 + return (struct rtas_error_log *)local_paca->mce_data_buf; 336 + } 337 + 336 338 /* 337 339 * Get the error information for errors coming through the 338 340 * FWNMI vectors. The pt_regs' r3 will be updated to reflect 339 341 * the actual r3 if possible, and a ptr to the error log entry 340 342 * will be returned if found. 341 343 * 342 - * If the RTAS error is not of the extended type, then we put it in a per 343 - * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. 344 + * Use one buffer mce_data_buf per cpu to store RTAS error. 344 345 * 345 - * The global_mce_data_buf does not have any locks or protection around it, 346 + * The mce_data_buf does not have any locks or protection around it, 346 347 * if a second machine check comes in, or a system reset is done 347 348 * before we have logged the error, then we will get corruption in the 348 349 * error log. 
This is preferable over holding off on calling ··· 356 349 static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) 357 350 { 358 351 unsigned long *savep; 359 - struct rtas_error_log *h, *errhdr = NULL; 352 + struct rtas_error_log *h; 360 353 361 354 /* Mask top two bits */ 362 355 regs->gpr[3] &= ~(0x3UL << 62); ··· 369 362 savep = __va(regs->gpr[3]); 370 363 regs->gpr[3] = savep[0]; /* restore original r3 */ 371 364 372 - /* If it isn't an extended log we can use the per cpu 64bit buffer */ 373 365 h = (struct rtas_error_log *)&savep[1]; 366 + /* Use the per cpu buffer from paca to store rtas error log */ 367 + memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 374 368 if (!rtas_error_extended(h)) { 375 - memcpy(this_cpu_ptr(&mce_data_buf), h, sizeof(__u64)); 376 - errhdr = (struct rtas_error_log *)this_cpu_ptr(&mce_data_buf); 369 + memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); 377 370 } else { 378 371 int len, error_log_length; 379 372 380 373 error_log_length = 8 + rtas_error_extended_log_length(h); 381 374 len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); 382 - memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); 383 - memcpy(global_mce_data_buf, h, len); 384 - errhdr = (struct rtas_error_log *)global_mce_data_buf; 375 + memcpy(local_paca->mce_data_buf, h, len); 385 376 } 386 377 387 - return errhdr; 378 + return (struct rtas_error_log *)local_paca->mce_data_buf; 388 379 } 389 380 390 381 /* Call this when done with the data returned by FWNMI_get_errinfo. ··· 425 420 return 1; 426 421 427 422 return 0; /* need to perform reset */ 423 + } 424 + 425 + /* 426 + * Process MCE rtas errlog event. 
427 + */ 428 + static void mce_process_errlog_event(struct irq_work *work) 429 + { 430 + struct rtas_error_log *err; 431 + 432 + err = fwnmi_get_errlog(); 433 + log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 428 434 } 429 435 430 436 /* ··· 482 466 recovered = 1; 483 467 } 484 468 485 - log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); 469 + /* Queue irq work to log this rtas event later. */ 470 + irq_work_queue(&mce_errlog_process_work); 486 471 487 472 return recovered; 488 473 }
+16
arch/powerpc/platforms/pseries/setup.c
··· 41 41 #include <linux/root_dev.h> 42 42 #include <linux/of.h> 43 43 #include <linux/of_pci.h> 44 + #include <linux/memblock.h> 44 45 45 46 #include <asm/mmu.h> 46 47 #include <asm/processor.h> ··· 103 102 static void __init fwnmi_init(void) 104 103 { 105 104 unsigned long system_reset_addr, machine_check_addr; 105 + u8 *mce_data_buf; 106 + unsigned int i; 107 + int nr_cpus = num_possible_cpus(); 106 108 107 109 int ibm_nmi_register = rtas_token("ibm,nmi-register"); 108 110 if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) ··· 119 115 if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr, 120 116 machine_check_addr)) 121 117 fwnmi_active = 1; 118 + 119 + /* 120 + * Allocate a chunk for per cpu buffer to hold rtas errorlog. 121 + * It will be used in real mode mce handler, hence it needs to be 122 + * below RMA. 123 + */ 124 + mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus, 125 + RTAS_ERROR_LOG_MAX, ppc64_rma_size)); 126 + for_each_possible_cpu(i) { 127 + paca_ptrs[i]->mce_data_buf = mce_data_buf + 128 + (RTAS_ERROR_LOG_MAX * i); 129 + } 122 130 } 123 131 124 132 static void pseries_8259_cascade(struct irq_desc *desc)