powerpc/powernv: Dump PHB diag-data immediately

The PHB diag-data is important for locating the root cause of
EEH errors such as a frozen PE or a fenced PHB. However, the EEH core
enables the IO path by clearing some of the HW registers before
collecting this data, which corrupts it.

This patch fixes the problem by dumping the PHB diag-data immediately
when a frozen/fenced state on a PE or PHB is first detected in the
eeh_ops::get_state() or next_error() backend.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
CC: <stable@vger.kernel.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

authored by

Gavin Shan and committed by

Benjamin Herrenschmidt 12 years ago 94716604 573ebfa6

+43 -53

1 changed file

expand all

arch

powerpc

platforms

powernv

eeh-ioda.c

+43 -53

arch/powerpc/platforms/powernv/eeh-ioda.c

··· 114 114 ioda_eeh_inbB_dbgfs_set, "0x%llx\n"); 115 115 #endif /* CONFIG_DEBUG_FS */ 116 116 117 + 117 118 /** 118 119 * ioda_eeh_post_init - Chip dependent post initialization 119 120 * @hose: PCI controller ··· 222 221 return ret; 223 222 } 224 223 224 + static void ioda_eeh_phb_diag(struct pci_controller *hose) 225 + { 226 + struct pnv_phb *phb = hose->private_data; 227 + long rc; 228 + 229 + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, 230 + PNV_PCI_DIAG_BUF_SIZE); 231 + if (rc != OPAL_SUCCESS) { 232 + pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", 233 + __func__, hose->global_number, rc); 234 + return; 235 + } 236 + 237 + pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); 238 + } 239 + 225 240 /** 226 241 * ioda_eeh_get_state - Retrieve the state of PE 227 242 * @pe: EEH PE ··· 289 272 result |= EEH_STATE_DMA_ACTIVE; 290 273 result |= EEH_STATE_MMIO_ENABLED; 291 274 result |= EEH_STATE_DMA_ENABLED; 275 + } else if (!(pe->state & EEH_PE_ISOLATED)) { 276 + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); 277 + ioda_eeh_phb_diag(hose); 292 278 } 293 279 294 280 return result; ··· 333 313 pr_warning("%s: Unexpected EEH status 0x%x " 334 314 "on PHB#%x-PE#%x\n", 335 315 __func__, fstate, hose->global_number, pe_no); 316 + } 317 + 318 + /* Dump PHB diag-data for frozen PE */ 319 + if (result != EEH_STATE_NOT_SUPPORT && 320 + (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) != 321 + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) && 322 + !(pe->state & EEH_PE_ISOLATED)) { 323 + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); 324 + ioda_eeh_phb_diag(hose); 336 325 } 337 326 338 327 return result; ··· 559 530 } 560 531 561 532 /** 562 - * ioda_eeh_get_log - Retrieve error log 563 - * @pe: EEH PE 564 - * @severity: Severity level of the log 565 - * @drv_log: buffer to store the log 566 - * @len: space of the log buffer 567 - * 568 - * The function is used to retrieve error log from P7IOC. 
569 - */ 570 - static int ioda_eeh_get_log(struct eeh_pe *pe, int severity, 571 - char *drv_log, unsigned long len) 572 - { 573 - s64 ret; 574 - unsigned long flags; 575 - struct pci_controller *hose = pe->phb; 576 - struct pnv_phb *phb = hose->private_data; 577 - 578 - spin_lock_irqsave(&phb->lock, flags); 579 - 580 - ret = opal_pci_get_phb_diag_data2(phb->opal_id, 581 - phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE); 582 - if (ret) { 583 - spin_unlock_irqrestore(&phb->lock, flags); 584 - pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n", 585 - __func__, hose->global_number, pe->addr, ret); 586 - return -EIO; 587 - } 588 - 589 - /* The PHB diag-data is always indicative */ 590 - pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); 591 - 592 - spin_unlock_irqrestore(&phb->lock, flags); 593 - 594 - return 0; 595 - } 596 - 597 - /** 598 533 * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE 599 534 * @pe: EEH PE 600 535 * ··· 637 644 pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n", 638 645 __func__, phb->hub_id, data->type); 639 646 } 640 - } 641 - 642 - static void ioda_eeh_phb_diag(struct pci_controller *hose) 643 - { 644 - struct pnv_phb *phb = hose->private_data; 645 - long rc; 646 - 647 - rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, 648 - PNV_PCI_DIAG_BUF_SIZE); 649 - if (rc != OPAL_SUCCESS) { 650 - pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n", 651 - __func__, hose->global_number, rc); 652 - return; 653 - } 654 - 655 - pnv_pci_dump_phb_diag_data(hose, phb->diag.blob); 656 647 } 657 648 658 649 static int ioda_eeh_get_phb_pe(struct pci_controller *hose, ··· 812 835 } 813 836 814 837 /* 838 + * EEH core will try recover from fenced PHB or 839 + * frozen PE. In the time for frozen PE, EEH core 840 + * enable IO path for that before collecting logs, 841 + * but it ruins the site. So we have to dump the 842 + * log in advance here. 
843 + */ 844 + if ((ret == EEH_NEXT_ERR_FROZEN_PE || 845 + ret == EEH_NEXT_ERR_FENCED_PHB) && 846 + !((*pe)->state & EEH_PE_ISOLATED)) { 847 + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); 848 + ioda_eeh_phb_diag(hose); 849 + } 850 + 851 + /* 815 852 * If we have no errors on the specific PHB or only 816 853 * informative error there, we continue poking it. 817 854 * Otherwise, we need actions to be taken by upper ··· 843 852 .set_option = ioda_eeh_set_option, 844 853 .get_state = ioda_eeh_get_state, 845 854 .reset = ioda_eeh_reset, 846 - .get_log = ioda_eeh_get_log, 847 855 .configure_bridge = ioda_eeh_configure_bridge, 848 856 .next_error = ioda_eeh_next_error 849 857 };

Configure Feed

Configure Feed