Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'eth-fbnic-add-devlink-health-support-for-fw-crashes-and-otp-mem-corruptions'

Jakub Kicinski says:

====================
eth: fbnic: add devlink health support for FW crashes and OTP mem corruptions

Add support for FW crash detection and a corresponding devlink health
reporter. Add a reporter for checking OTP memory health.

The output is not particularly exciting:

# devlink health show
pci/0000:01:00.0:
reporter fw
state healthy error 0 recover 0 auto_dump true
reporter otp
state healthy error 0 recover 0 auto_dump true
# devlink health diagnose pci/0000:01:00.0 reporter fw
FW uptime: 0
# devlink health dump show pci/0000:01:00.0 reporter fw
FW coredump:
5a 45 01 00 04 00 06 00 00 00 00 00 4d 01 00 d0
.. lots of hex follows ..
# devlink health dump show pci/0000:01:00.0 reporter otp
OTP:
Status: 0 Data: 0 ECC: 0

v2: https://lore.kernel.org/20250915155312.1083292-1-kuba@kernel.org
v1: https://lore.kernel.org/20250912201428.566190-1-kuba@kernel.org
====================

Link: https://patch.msgid.link/20250916231420.1693955-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+659 -28
+19
Documentation/networking/device_drivers/ethernet/meta/fbnic.rst
··· 69 69 is required. Firmware activation is required to run new control firmware. cmrt 70 70 firmware can only be activated by power cycling the NIC. 71 71 72 + Health reporters 73 + ---------------- 74 + 75 + fw reporter 76 + ~~~~~~~~~~~ 77 + 78 + The ``fw`` health reporter tracks FW crashes. Dumping the reporter will 79 + show the core dump of the most recent FW crash or, if no FW crash has 80 + happened since the last power cycle, a snapshot of the FW memory. The diagnose 81 + callback shows FW uptime based on the most recently received heartbeat message 82 + (crashes are detected by checking whether the uptime goes down). 83 + 84 + otp reporter 85 + ~~~~~~~~~~~~ 86 + 87 + OTP memory ("fuses") is used for secure boot and anti-rollback 88 + protection. The OTP memory is ECC protected; ECC errors indicate 89 + either a manufacturing defect or a part deteriorating with age. 90 + 72 91 Statistics 73 92 ---------- 74 93
+13
drivers/net/ethernet/meta/fbnic/fbnic.h
··· 27 27 struct net_device *netdev; 28 28 struct dentry *dbg_fbd; 29 29 struct device *hwmon; 30 + struct devlink_health_reporter *fw_reporter; 31 + struct devlink_health_reporter *otp_reporter; 30 32 31 33 u32 __iomem *uc_addr0; 32 34 u32 __iomem *uc_addr4; ··· 85 83 86 84 /* Local copy of hardware statistics */ 87 85 struct fbnic_hw_stats hw_stats; 86 + 87 + /* Firmware time since boot in milliseconds */ 88 + u64 firmware_time; 89 + u64 prev_firmware_time; 88 90 89 91 struct fbnic_fw_log fw_log; 90 92 }; ··· 161 155 162 156 void fbnic_devlink_free(struct fbnic_dev *fbd); 163 157 struct fbnic_dev *fbnic_devlink_alloc(struct pci_dev *pdev); 158 + int fbnic_devlink_health_create(struct fbnic_dev *fbd); 159 + void fbnic_devlink_health_destroy(struct fbnic_dev *fbd); 164 160 void fbnic_devlink_register(struct fbnic_dev *fbd); 165 161 void fbnic_devlink_unregister(struct fbnic_dev *fbd); 162 + void __printf(2, 3) 163 + fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...); 164 + void fbnic_devlink_otp_check(struct fbnic_dev *fbd, const char *msg); 166 165 167 166 int fbnic_fw_request_mbx(struct fbnic_dev *fbd); 168 167 void fbnic_fw_free_mbx(struct fbnic_dev *fbd); ··· 197 186 void fbnic_dbg_fbd_exit(struct fbnic_dev *fbd); 198 187 void fbnic_dbg_init(void); 199 188 void fbnic_dbg_exit(void); 189 + 190 + void fbnic_rpc_reset_valid_entries(struct fbnic_dev *fbd); 200 191 201 192 void fbnic_csr_get_regs(struct fbnic_dev *fbd, u32 *data, u32 *regs_version); 202 193 int fbnic_csr_regs_len(struct fbnic_dev *fbd);
+18
drivers/net/ethernet/meta/fbnic/fbnic_csr.h
··· 1178 1178 #define FBNIC_IPC_MBX_DESC_FW_CMPL DESC_BIT(1) 1179 1179 #define FBNIC_IPC_MBX_DESC_HOST_CMPL DESC_BIT(0) 1180 1180 1181 + /* OTP Registers 1182 + * These registers are accessible via bar4 offset and are written by CMRT 1183 + * on boot. For the write status, the register is broken up in half with OTP 1184 + * Write Data Status occupying the top 16 bits and the ECC status occupying the 1185 + * bottom 16 bits. 1186 + */ 1187 + #define FBNIC_NS_OTP_STATUS 0x0021d 1188 + #define FBNIC_NS_OTP_WRITE_STATUS 0x0021e 1189 + 1190 + #define FBNIC_NS_OTP_WRITE_DATA_STATUS_MASK CSR_GENMASK(31, 16) 1191 + #define FBNIC_NS_OTP_WRITE_ECC_STATUS_MASK CSR_GENMASK(15, 0) 1192 + 1193 + #define FBNIC_REGS_VERSION CSR_GENMASK(31, 16) 1194 + #define FBNIC_REGS_HW_TYPE CSR_GENMASK(15, 8) 1195 + enum{ 1196 + FBNIC_CSR_VERSION_V1_0_ASIC = 1, 1197 + }; 1198 + 1181 1199 #endif /* _FBNIC_CSR_H_ */
+249
drivers/net/ethernet/meta/fbnic/fbnic_devlink.c
··· 8 8 #include <net/devlink.h> 9 9 10 10 #include "fbnic.h" 11 + #include "fbnic_fw.h" 11 12 #include "fbnic_tlv.h" 12 13 13 14 #define FBNIC_SN_STR_LEN 24 ··· 369 368 .info_get = fbnic_devlink_info_get, 370 369 .flash_update = fbnic_devlink_flash_update, 371 370 }; 371 + 372 + static int fbnic_fw_reporter_dump(struct devlink_health_reporter *reporter, 373 + struct devlink_fmsg *fmsg, void *priv_ctx, 374 + struct netlink_ext_ack *extack) 375 + { 376 + struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter); 377 + u32 offset, index, index_count, length, size; 378 + struct fbnic_fw_completion *fw_cmpl; 379 + u8 *dump_data, **data; 380 + int err; 381 + 382 + fw_cmpl = fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP); 383 + if (!fw_cmpl) 384 + return -ENOMEM; 385 + 386 + err = fbnic_fw_xmit_coredump_info_msg(fbd, fw_cmpl, true); 387 + if (err) { 388 + NL_SET_ERR_MSG_MOD(extack, 389 + "Failed to transmit core dump info msg"); 390 + goto cmpl_free; 391 + } 392 + if (!wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) { 393 + NL_SET_ERR_MSG_MOD(extack, 394 + "Timed out waiting on core dump info"); 395 + err = -ETIMEDOUT; 396 + goto cmpl_cleanup; 397 + } 398 + 399 + size = fw_cmpl->u.coredump_info.size; 400 + err = fw_cmpl->result; 401 + 402 + fbnic_mbx_clear_cmpl(fbd, fw_cmpl); 403 + fbnic_fw_put_cmpl(fw_cmpl); 404 + 405 + /* Handle error returned by firmware */ 406 + if (err) { 407 + NL_SET_ERR_MSG_MOD(extack, "Firmware core dump returned error"); 408 + return err; 409 + } 410 + if (!size) { 411 + NL_SET_ERR_MSG_MOD(extack, 412 + "Firmware core dump returned size 0"); 413 + return -EIO; 414 + } 415 + 416 + /* Read the dump, we can only transfer TLV_MAX_DATA at a time */ 417 + index_count = DIV_ROUND_UP(size, TLV_MAX_DATA); 418 + 419 + fw_cmpl = __fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP, 420 + sizeof(void *) * index_count + size); 421 + if (!fw_cmpl) 422 + return -ENOMEM; 423 + 424 + /* Populate pointer table w/ pointer offsets */ 
425 + dump_data = (void *)&fw_cmpl->u.coredump.data[index_count]; 426 + data = fw_cmpl->u.coredump.data; 427 + fw_cmpl->u.coredump.size = size; 428 + fw_cmpl->u.coredump.stride = TLV_MAX_DATA; 429 + 430 + for (index = 0; index < index_count; index++) { 431 + /* First iteration installs completion */ 432 + struct fbnic_fw_completion *cmpl_arg = index ? NULL : fw_cmpl; 433 + 434 + offset = index * TLV_MAX_DATA; 435 + length = min(size - offset, TLV_MAX_DATA); 436 + 437 + data[index] = dump_data + offset; 438 + err = fbnic_fw_xmit_coredump_read_msg(fbd, cmpl_arg, 439 + offset, length); 440 + if (err) { 441 + NL_SET_ERR_MSG_MOD(extack, 442 + "Failed to transmit core dump msg"); 443 + if (cmpl_arg) 444 + goto cmpl_free; 445 + else 446 + goto cmpl_cleanup; 447 + } 448 + 449 + if (wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) { 450 + reinit_completion(&fw_cmpl->done); 451 + } else { 452 + NL_SET_ERR_MSG_FMT_MOD(extack, 453 + "Timed out waiting on core dump (%d/%d)", 454 + index + 1, index_count); 455 + err = -ETIMEDOUT; 456 + goto cmpl_cleanup; 457 + } 458 + 459 + /* If we didn't see the reply record as incomplete */ 460 + if (fw_cmpl->u.coredump.data[index]) { 461 + NL_SET_ERR_MSG_FMT_MOD(extack, 462 + "No data for core dump chunk (%d/%d)", 463 + index + 1, index_count); 464 + err = -EIO; 465 + goto cmpl_cleanup; 466 + } 467 + } 468 + 469 + devlink_fmsg_binary_pair_nest_start(fmsg, "FW coredump"); 470 + 471 + for (offset = 0; offset < size; offset += length) { 472 + length = min_t(u32, size - offset, TLV_MAX_DATA); 473 + 474 + devlink_fmsg_binary_put(fmsg, dump_data + offset, length); 475 + } 476 + 477 + devlink_fmsg_binary_pair_nest_end(fmsg); 478 + 479 + cmpl_cleanup: 480 + fbnic_mbx_clear_cmpl(fbd, fw_cmpl); 481 + cmpl_free: 482 + fbnic_fw_put_cmpl(fw_cmpl); 483 + 484 + return err; 485 + } 486 + 487 + static int 488 + fbnic_fw_reporter_diagnose(struct devlink_health_reporter *reporter, 489 + struct devlink_fmsg *fmsg, 490 + struct netlink_ext_ack *extack) 491 + 
{ 492 + struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter); 493 + u32 sec, msec; 494 + 495 + /* Device is most likely down, we're not exchanging heartbeats */ 496 + if (!fbd->prev_firmware_time) 497 + return 0; 498 + 499 + sec = div_u64_rem(fbd->firmware_time, MSEC_PER_SEC, &msec); 500 + 501 + devlink_fmsg_pair_nest_start(fmsg, "last_heartbeat"); 502 + devlink_fmsg_obj_nest_start(fmsg); 503 + devlink_fmsg_pair_nest_start(fmsg, "fw_uptime"); 504 + devlink_fmsg_obj_nest_start(fmsg); 505 + devlink_fmsg_u32_pair_put(fmsg, "sec", sec); 506 + devlink_fmsg_u32_pair_put(fmsg, "msec", msec); 507 + devlink_fmsg_obj_nest_end(fmsg); 508 + devlink_fmsg_pair_nest_end(fmsg); 509 + devlink_fmsg_obj_nest_end(fmsg); 510 + devlink_fmsg_pair_nest_end(fmsg); 511 + 512 + return 0; 513 + } 514 + 515 + void __printf(2, 3) 516 + fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...) 517 + { 518 + char msg[FBNIC_FW_LOG_MAX_SIZE]; 519 + va_list args; 520 + 521 + va_start(args, format); 522 + vsnprintf(msg, FBNIC_FW_LOG_MAX_SIZE, format, args); 523 + va_end(args); 524 + 525 + devlink_health_report(fbd->fw_reporter, msg, fbd); 526 + if (fbnic_fw_log_ready(fbd)) 527 + fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg); 528 + } 529 + 530 + static const struct devlink_health_reporter_ops fbnic_fw_ops = { 531 + .name = "fw", 532 + .dump = fbnic_fw_reporter_dump, 533 + .diagnose = fbnic_fw_reporter_diagnose, 534 + }; 535 + 536 + static u32 fbnic_read_otp_status(struct fbnic_dev *fbd) 537 + { 538 + return fbnic_fw_rd32(fbd, FBNIC_NS_OTP_STATUS); 539 + } 540 + 541 + static int 542 + fbnic_otp_reporter_dump(struct devlink_health_reporter *reporter, 543 + struct devlink_fmsg *fmsg, void *priv_ctx, 544 + struct netlink_ext_ack *extack) 545 + { 546 + struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter); 547 + u32 otp_status, otp_write_status, m; 548 + 549 + otp_status = fbnic_read_otp_status(fbd); 550 + otp_write_status = fbnic_fw_rd32(fbd, 
FBNIC_NS_OTP_WRITE_STATUS); 551 + 552 + /* Dump OTP status */ 553 + devlink_fmsg_pair_nest_start(fmsg, "OTP"); 554 + devlink_fmsg_obj_nest_start(fmsg); 555 + 556 + devlink_fmsg_u32_pair_put(fmsg, "Status", otp_status); 557 + 558 + /* Extract OTP Write Data status */ 559 + m = FBNIC_NS_OTP_WRITE_DATA_STATUS_MASK; 560 + devlink_fmsg_u32_pair_put(fmsg, "Data", 561 + FIELD_GET(m, otp_write_status)); 562 + 563 + /* Extract OTP Write ECC status */ 564 + m = FBNIC_NS_OTP_WRITE_ECC_STATUS_MASK; 565 + devlink_fmsg_u32_pair_put(fmsg, "ECC", 566 + FIELD_GET(m, otp_write_status)); 567 + 568 + devlink_fmsg_obj_nest_end(fmsg); 569 + devlink_fmsg_pair_nest_end(fmsg); 570 + 571 + return 0; 572 + } 573 + 574 + void fbnic_devlink_otp_check(struct fbnic_dev *fbd, const char *msg) 575 + { 576 + /* Check if there is anything to report */ 577 + if (!fbnic_read_otp_status(fbd)) 578 + return; 579 + 580 + devlink_health_report(fbd->otp_reporter, msg, fbd); 581 + if (fbnic_fw_log_ready(fbd)) 582 + fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg); 583 + } 584 + 585 + static const struct devlink_health_reporter_ops fbnic_otp_ops = { 586 + .name = "otp", 587 + .dump = fbnic_otp_reporter_dump, 588 + }; 589 + 590 + int fbnic_devlink_health_create(struct fbnic_dev *fbd) 591 + { 592 + fbd->fw_reporter = devlink_health_reporter_create(priv_to_devlink(fbd), 593 + &fbnic_fw_ops, fbd); 594 + if (IS_ERR(fbd->fw_reporter)) { 595 + dev_warn(fbd->dev, 596 + "Failed to create FW fault reporter: %pe\n", 597 + fbd->fw_reporter); 598 + return PTR_ERR(fbd->fw_reporter); 599 + } 600 + 601 + fbd->otp_reporter = devlink_health_reporter_create(priv_to_devlink(fbd), 602 + &fbnic_otp_ops, fbd); 603 + if (IS_ERR(fbd->otp_reporter)) { 604 + devlink_health_reporter_destroy(fbd->fw_reporter); 605 + dev_warn(fbd->dev, 606 + "Failed to create OTP fault reporter: %pe\n", 607 + fbd->otp_reporter); 608 + return PTR_ERR(fbd->otp_reporter); 609 + } 610 + 611 + return 0; 612 + } 613 + 614 + void 
fbnic_devlink_health_destroy(struct fbnic_dev *fbd) 615 + { 616 + devlink_health_reporter_destroy(fbd->otp_reporter); 617 + devlink_health_reporter_destroy(fbd->fw_reporter); 618 + } 372 619 373 620 void fbnic_devlink_free(struct fbnic_dev *fbd) 374 621 {
+238 -3
drivers/net/ethernet/meta/fbnic/fbnic_fw.c
··· 495 495 496 496 fbd->last_heartbeat_request = req_time; 497 497 498 + /* Set prev_firmware_time to 0 to avoid triggering firmware crash 499 + * detection until we receive the second uptime in a heartbeat resp. 500 + */ 501 + fbd->prev_firmware_time = 0; 502 + 498 503 /* Set heartbeat detection based on if we are taking ownership */ 499 504 fbd->fw_heartbeat_enabled = take_ownership; 500 505 ··· 665 660 } 666 661 667 662 static const struct fbnic_tlv_index fbnic_ownership_resp_index[] = { 663 + FBNIC_TLV_ATTR_U64(FBNIC_FW_OWNERSHIP_TIME), 668 664 FBNIC_TLV_ATTR_LAST 669 665 }; 670 666 ··· 677 671 /* Count the ownership response as a heartbeat reply */ 678 672 fbd->last_heartbeat_response = jiffies; 679 673 674 + /* Capture firmware time for logging and firmware crash check */ 675 + fbd->firmware_time = fta_get_uint(results, FBNIC_FW_OWNERSHIP_TIME); 676 + 680 677 return 0; 681 678 } 682 679 683 680 static const struct fbnic_tlv_index fbnic_heartbeat_resp_index[] = { 681 + FBNIC_TLV_ATTR_U64(FBNIC_FW_HEARTBEAT_UPTIME), 684 682 FBNIC_TLV_ATTR_LAST 685 683 }; 686 684 ··· 694 684 struct fbnic_dev *fbd = (struct fbnic_dev *)opaque; 695 685 696 686 fbd->last_heartbeat_response = jiffies; 687 + 688 + /* Capture firmware time for logging and firmware crash check */ 689 + fbd->firmware_time = fta_get_uint(results, FBNIC_FW_HEARTBEAT_UPTIME); 697 690 698 691 return 0; 699 692 } ··· 719 706 goto free_message; 720 707 721 708 fbd->last_heartbeat_request = req_time; 709 + fbd->prev_firmware_time = fbd->firmware_time; 722 710 723 711 return err; 724 712 ··· 780 766 return; 781 767 782 768 /* Was the last heartbeat response long time ago? 
*/ 783 - if (!fbnic_fw_heartbeat_current(fbd)) { 769 + if (!fbnic_fw_heartbeat_current(fbd) || 770 + fbd->firmware_time < fbd->prev_firmware_time) { 784 771 dev_warn(fbd->dev, 785 772 "Firmware did not respond to heartbeat message\n"); 786 773 fbd->fw_heartbeat_enabled = false; ··· 791 776 err = fbnic_fw_xmit_heartbeat_message(fbd); 792 777 if (err) 793 778 dev_warn(fbd->dev, "Failed to send heartbeat message\n"); 779 + } 780 + 781 + /** 782 + * fbnic_fw_xmit_coredump_info_msg - Create and transmit a coredump info message 783 + * @fbd: FBNIC device structure 784 + * @cmpl_data: Structure to store info in 785 + * @force: Force coredump event if one hasn't already occurred 786 + * 787 + * Return: zero on success, negative errno on failure 788 + * 789 + * Asks the FW for info related to coredump. If a coredump doesn't exist it 790 + * can optionally force one if force is true. 791 + */ 792 + int fbnic_fw_xmit_coredump_info_msg(struct fbnic_dev *fbd, 793 + struct fbnic_fw_completion *cmpl_data, 794 + bool force) 795 + { 796 + struct fbnic_tlv_msg *msg; 797 + int err = 0; 798 + 799 + msg = fbnic_tlv_msg_alloc(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_REQ); 800 + if (!msg) 801 + return -ENOMEM; 802 + 803 + if (force) { 804 + err = fbnic_tlv_attr_put_flag(msg, FBNIC_FW_COREDUMP_REQ_INFO_CREATE); 805 + if (err) 806 + goto free_msg; 807 + } 808 + 809 + err = fbnic_mbx_map_req_w_cmpl(fbd, msg, cmpl_data); 810 + if (err) 811 + goto free_msg; 812 + 813 + return 0; 814 + 815 + free_msg: 816 + free_page((unsigned long)msg); 817 + return err; 818 + } 819 + 820 + static const struct fbnic_tlv_index fbnic_coredump_info_resp_index[] = { 821 + FBNIC_TLV_ATTR_FLAG(FBNIC_FW_COREDUMP_INFO_AVAILABLE), 822 + FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_INFO_SIZE), 823 + FBNIC_TLV_ATTR_S32(FBNIC_FW_COREDUMP_INFO_ERROR), 824 + FBNIC_TLV_ATTR_LAST 825 + }; 826 + 827 + static int 828 + fbnic_fw_parse_coredump_info_resp(void *opaque, struct fbnic_tlv_msg **results) 829 + { 830 + struct fbnic_fw_completion 
*cmpl_data; 831 + struct fbnic_dev *fbd = opaque; 832 + u32 msg_type; 833 + s32 err; 834 + 835 + /* Verify we have a completion pointer to provide with data */ 836 + msg_type = FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP; 837 + cmpl_data = fbnic_fw_get_cmpl_by_type(fbd, msg_type); 838 + if (!cmpl_data) 839 + return -ENOSPC; 840 + 841 + err = fta_get_sint(results, FBNIC_FW_COREDUMP_INFO_ERROR); 842 + if (err) 843 + goto msg_err; 844 + 845 + if (!results[FBNIC_FW_COREDUMP_INFO_AVAILABLE]) { 846 + err = -ENOENT; 847 + goto msg_err; 848 + } 849 + 850 + cmpl_data->u.coredump_info.size = 851 + fta_get_uint(results, FBNIC_FW_COREDUMP_INFO_SIZE); 852 + 853 + msg_err: 854 + cmpl_data->result = err; 855 + complete(&cmpl_data->done); 856 + fbnic_fw_put_cmpl(cmpl_data); 857 + 858 + return err; 859 + } 860 + 861 + /** 862 + * fbnic_fw_xmit_coredump_read_msg - Create and transmit a coredump read request 863 + * @fbd: FBNIC device structure 864 + * @cmpl_data: Completion struct to store coredump 865 + * @offset: Offset into coredump requested 866 + * @length: Length of section of cordeump to fetch 867 + * 868 + * Return: zero on success, negative errno on failure 869 + * 870 + * Asks the firmware to provide a section of the cordeump back in a message. 871 + * The response will have an offset and size matching the values provided. 
872 + */ 873 + int fbnic_fw_xmit_coredump_read_msg(struct fbnic_dev *fbd, 874 + struct fbnic_fw_completion *cmpl_data, 875 + u32 offset, u32 length) 876 + { 877 + struct fbnic_tlv_msg *msg; 878 + int err = 0; 879 + 880 + msg = fbnic_tlv_msg_alloc(FBNIC_TLV_MSG_ID_COREDUMP_READ_REQ); 881 + if (!msg) 882 + return -ENOMEM; 883 + 884 + if (offset) { 885 + err = fbnic_tlv_attr_put_int(msg, FBNIC_FW_COREDUMP_READ_OFFSET, 886 + offset); 887 + if (err) 888 + goto free_message; 889 + } 890 + 891 + if (length) { 892 + err = fbnic_tlv_attr_put_int(msg, FBNIC_FW_COREDUMP_READ_LENGTH, 893 + length); 894 + if (err) 895 + goto free_message; 896 + } 897 + 898 + err = fbnic_mbx_map_req_w_cmpl(fbd, msg, cmpl_data); 899 + if (err) 900 + goto free_message; 901 + 902 + return 0; 903 + 904 + free_message: 905 + free_page((unsigned long)msg); 906 + return err; 907 + } 908 + 909 + static const struct fbnic_tlv_index fbnic_coredump_resp_index[] = { 910 + FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_READ_OFFSET), 911 + FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_READ_LENGTH), 912 + FBNIC_TLV_ATTR_RAW_DATA(FBNIC_FW_COREDUMP_READ_DATA), 913 + FBNIC_TLV_ATTR_S32(FBNIC_FW_COREDUMP_READ_ERROR), 914 + FBNIC_TLV_ATTR_LAST 915 + }; 916 + 917 + static int fbnic_fw_parse_coredump_resp(void *opaque, 918 + struct fbnic_tlv_msg **results) 919 + { 920 + struct fbnic_fw_completion *cmpl_data; 921 + u32 index, last_offset, last_length; 922 + struct fbnic_dev *fbd = opaque; 923 + struct fbnic_tlv_msg *data_hdr; 924 + u32 length, offset; 925 + u32 msg_type; 926 + s32 err; 927 + 928 + /* Verify we have a completion pointer to provide with data */ 929 + msg_type = FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP; 930 + cmpl_data = fbnic_fw_get_cmpl_by_type(fbd, msg_type); 931 + if (!cmpl_data) 932 + return -ENOSPC; 933 + 934 + err = fta_get_sint(results, FBNIC_FW_COREDUMP_READ_ERROR); 935 + if (err) 936 + goto msg_err; 937 + 938 + data_hdr = results[FBNIC_FW_COREDUMP_READ_DATA]; 939 + if (!data_hdr) { 940 + err = -ENODATA; 941 + goto 
msg_err; 942 + } 943 + 944 + offset = fta_get_uint(results, FBNIC_FW_COREDUMP_READ_OFFSET); 945 + length = fta_get_uint(results, FBNIC_FW_COREDUMP_READ_LENGTH); 946 + 947 + if (length > le16_to_cpu(data_hdr->hdr.len) - sizeof(u32)) { 948 + dev_err(fbd->dev, "length greater than size of message\n"); 949 + err = -EINVAL; 950 + goto msg_err; 951 + } 952 + 953 + /* Only the last offset can have a length != stride */ 954 + last_length = 955 + (cmpl_data->u.coredump.size % cmpl_data->u.coredump.stride) ? : 956 + cmpl_data->u.coredump.stride; 957 + last_offset = cmpl_data->u.coredump.size - last_length; 958 + 959 + /* Verify offset and length */ 960 + if (offset % cmpl_data->u.coredump.stride || offset > last_offset) { 961 + dev_err(fbd->dev, "offset %d out of range\n", offset); 962 + err = -EINVAL; 963 + } else if (length != ((offset == last_offset) ? 964 + last_length : cmpl_data->u.coredump.stride)) { 965 + dev_err(fbd->dev, "length %d out of range for offset %d\n", 966 + length, offset); 967 + err = -EINVAL; 968 + } 969 + if (err) 970 + goto msg_err; 971 + 972 + /* If data pointer is NULL it is already filled, just skip the copy */ 973 + index = offset / cmpl_data->u.coredump.stride; 974 + if (!cmpl_data->u.coredump.data[index]) 975 + goto msg_err; 976 + 977 + /* Copy data and mark index filled by setting pointer to NULL */ 978 + memcpy(cmpl_data->u.coredump.data[index], 979 + fbnic_tlv_attr_get_value_ptr(data_hdr), length); 980 + cmpl_data->u.coredump.data[index] = NULL; 981 + 982 + msg_err: 983 + cmpl_data->result = err; 984 + complete(&cmpl_data->done); 985 + fbnic_fw_put_cmpl(cmpl_data); 986 + 987 + return err; 794 988 } 795 989 796 990 int fbnic_fw_xmit_fw_start_upgrade(struct fbnic_dev *fbd, ··· 1431 1207 fbnic_fw_parse_ownership_resp), 1432 1208 FBNIC_TLV_PARSER(HEARTBEAT_RESP, fbnic_heartbeat_resp_index, 1433 1209 fbnic_fw_parse_heartbeat_resp), 1210 + FBNIC_TLV_PARSER(COREDUMP_GET_INFO_RESP, 1211 + fbnic_coredump_info_resp_index, 1212 + 
fbnic_fw_parse_coredump_info_resp), 1213 + FBNIC_TLV_PARSER(COREDUMP_READ_RESP, fbnic_coredump_resp_index, 1214 + fbnic_fw_parse_coredump_resp), 1434 1215 FBNIC_TLV_PARSER(FW_START_UPGRADE_RESP, 1435 1216 fbnic_fw_start_upgrade_resp_index, 1436 1217 fbnic_fw_parse_fw_start_upgrade_resp), ··· 1758 1529 fw_version, str_sz); 1759 1530 } 1760 1531 1761 - struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type) 1532 + struct fbnic_fw_completion *__fbnic_fw_alloc_cmpl(u32 msg_type, 1533 + size_t priv_size) 1762 1534 { 1763 1535 struct fbnic_fw_completion *cmpl; 1764 1536 1765 - cmpl = kzalloc(sizeof(*cmpl), GFP_KERNEL); 1537 + cmpl = kzalloc(sizeof(*cmpl) + priv_size, GFP_KERNEL); 1766 1538 if (!cmpl) 1767 1539 return NULL; 1768 1540 ··· 1772 1542 kref_init(&cmpl->ref_count); 1773 1543 1774 1544 return cmpl; 1545 + } 1546 + 1547 + struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type) 1548 + { 1549 + return __fbnic_fw_alloc_cmpl(msg_type, 0); 1775 1550 } 1776 1551 1777 1552 void fbnic_fw_put_cmpl(struct fbnic_fw_completion *fw_cmpl)
+47
drivers/net/ethernet/meta/fbnic/fbnic_fw.h
··· 67 67 int result; 68 68 union { 69 69 struct { 70 + u32 size; 71 + } coredump_info; 72 + struct { 73 + u32 size; 74 + u16 stride; 75 + u8 *data[]; 76 + } coredump; 77 + struct { 70 78 u32 offset; 71 79 u32 length; 72 80 } fw_update; ··· 97 89 int fbnic_fw_xmit_ownership_msg(struct fbnic_dev *fbd, bool take_ownership); 98 90 int fbnic_fw_init_heartbeat(struct fbnic_dev *fbd, bool poll); 99 91 void fbnic_fw_check_heartbeat(struct fbnic_dev *fbd); 92 + int fbnic_fw_xmit_coredump_info_msg(struct fbnic_dev *fbd, 93 + struct fbnic_fw_completion *cmpl_data, 94 + bool force); 95 + int fbnic_fw_xmit_coredump_read_msg(struct fbnic_dev *fbd, 96 + struct fbnic_fw_completion *cmpl_data, 97 + u32 offset, u32 length); 100 98 int fbnic_fw_xmit_fw_start_upgrade(struct fbnic_dev *fbd, 101 99 struct fbnic_fw_completion *cmpl_data, 102 100 unsigned int id, unsigned int len); ··· 114 100 int fbnic_fw_xmit_send_logs(struct fbnic_dev *fbd, bool enable, 115 101 bool send_log_history); 116 102 int fbnic_fw_xmit_rpc_macda_sync(struct fbnic_dev *fbd); 103 + struct fbnic_fw_completion *__fbnic_fw_alloc_cmpl(u32 msg_type, 104 + size_t priv_size); 117 105 struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type); 118 106 void fbnic_fw_put_cmpl(struct fbnic_fw_completion *cmpl_data); 119 107 ··· 151 135 FBNIC_TLV_MSG_ID_OWNERSHIP_RESP = 0x13, 152 136 FBNIC_TLV_MSG_ID_HEARTBEAT_REQ = 0x14, 153 137 FBNIC_TLV_MSG_ID_HEARTBEAT_RESP = 0x15, 138 + FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_REQ = 0x18, 139 + FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP = 0x19, 140 + FBNIC_TLV_MSG_ID_COREDUMP_READ_REQ = 0x20, 141 + FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP = 0x21, 154 142 FBNIC_TLV_MSG_ID_FW_START_UPGRADE_REQ = 0x22, 155 143 FBNIC_TLV_MSG_ID_FW_START_UPGRADE_RESP = 0x23, 156 144 FBNIC_TLV_MSG_ID_FW_WRITE_CHUNK_REQ = 0x24, ··· 218 198 219 199 enum { 220 200 FBNIC_FW_OWNERSHIP_FLAG = 0x0, 201 + FBNIC_FW_OWNERSHIP_TIME = 0x1, 221 202 FBNIC_FW_OWNERSHIP_MSG_MAX 203 + }; 204 + 205 + enum { 206 + 
FBNIC_FW_HEARTBEAT_UPTIME = 0x0, 207 + FBNIC_FW_HEARTBEAT_NUMBER_OF_MESSAGES = 0x1, 208 + FBNIC_FW_HEARTBEAT_MSG_MAX 209 + }; 210 + 211 + enum { 212 + FBNIC_FW_COREDUMP_REQ_INFO_CREATE = 0x0, 213 + FBNIC_FW_COREDUMP_REQ_INFO_MSG_MAX 214 + }; 215 + 216 + enum { 217 + FBNIC_FW_COREDUMP_INFO_AVAILABLE = 0x0, 218 + FBNIC_FW_COREDUMP_INFO_SIZE = 0x1, 219 + FBNIC_FW_COREDUMP_INFO_ERROR = 0x2, 220 + FBNIC_FW_COREDUMP_INFO_MSG_MAX 221 + }; 222 + 223 + enum { 224 + FBNIC_FW_COREDUMP_READ_OFFSET = 0x0, 225 + FBNIC_FW_COREDUMP_READ_LENGTH = 0x1, 226 + FBNIC_FW_COREDUMP_READ_DATA = 0x2, 227 + FBNIC_FW_COREDUMP_READ_ERROR = 0x3, 228 + FBNIC_FW_COREDUMP_READ_MSG_MAX 222 229 }; 223 230 224 231 enum {
+1 -1
drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c
··· 72 72 } 73 73 74 74 int fbnic_fw_log_write(struct fbnic_dev *fbd, u64 index, u32 timestamp, 75 - char *msg) 75 + const char *msg) 76 76 { 77 77 struct fbnic_fw_log_entry *entry, *head, *tail, *next; 78 78 struct fbnic_fw_log *log = &fbd->fw_log;
+1 -1
drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h
··· 41 41 int fbnic_fw_log_init(struct fbnic_dev *fbd); 42 42 void fbnic_fw_log_free(struct fbnic_dev *fbd); 43 43 int fbnic_fw_log_write(struct fbnic_dev *fbd, u64 index, u32 timestamp, 44 - char *msg); 44 + const char *msg); 45 45 #endif /* _FBNIC_FW_LOG_H_ */
+31 -8
drivers/net/ethernet/meta/fbnic/fbnic_pci.c
··· 167 167 fbnic_flush(fbn); 168 168 } 169 169 170 + static int fbnic_fw_config_after_crash(struct fbnic_dev *fbd) 171 + { 172 + if (fbnic_fw_xmit_ownership_msg(fbd, true)) { 173 + dev_err(fbd->dev, "NIC failed to take ownership\n"); 174 + 175 + return -1; 176 + } 177 + 178 + fbnic_rpc_reset_valid_entries(fbd); 179 + __fbnic_set_rx_mode(fbd); 180 + 181 + return 0; 182 + } 183 + 170 184 static void fbnic_health_check(struct fbnic_dev *fbd) 171 185 { 172 186 struct fbnic_fw_mbx *tx_mbx = &fbd->mbx[FBNIC_IPC_MBX_TX_IDX]; ··· 196 182 if (tx_mbx->head != tx_mbx->tail) 197 183 return; 198 184 199 - /* TBD: Need to add a more thorough recovery here. 200 - * Specifically I need to verify what all the firmware will have 201 - * changed since we had setup and it rebooted. May just need to 202 - * perform a down/up. For now we will just reclaim ownership so 203 - * the heartbeat can catch the next fault. 204 - */ 205 - fbnic_fw_xmit_ownership_msg(fbd, true); 185 + fbnic_devlink_fw_report(fbd, "Firmware crashed detected!"); 186 + fbnic_devlink_otp_check(fbd, "error detected after firmware recovery"); 187 + 188 + if (fbnic_fw_config_after_crash(fbd)) 189 + dev_err(fbd->dev, "Firmware recovery failed after crash\n"); 206 190 } 207 191 208 192 static void fbnic_service_task(struct work_struct *work) ··· 281 269 return -ENOMEM; 282 270 } 283 271 272 + err = fbnic_devlink_health_create(fbd); 273 + if (err) 274 + goto free_fbd; 275 + 284 276 /* Populate driver with hardware-specific info and handlers */ 285 277 fbd->max_num_queues = info->max_num_queues; 286 278 ··· 295 279 296 280 err = fbnic_alloc_irqs(fbd); 297 281 if (err) 298 - goto free_fbd; 282 + goto err_destroy_health; 299 283 300 284 err = fbnic_mac_init(fbd); 301 285 if (err) { ··· 322 306 err); 323 307 324 308 fbnic_devlink_register(fbd); 309 + fbnic_devlink_otp_check(fbd, "error detected during probe"); 325 310 fbnic_dbg_fbd_init(fbd); 326 311 327 312 /* Capture snapshot of hardware stats so netdev can calculate delta 
*/ ··· 365 348 return 0; 366 349 free_irqs: 367 350 fbnic_free_irqs(fbd); 351 + err_destroy_health: 352 + fbnic_devlink_health_destroy(fbd); 368 353 free_fbd: 369 354 fbnic_devlink_free(fbd); 370 355 ··· 401 382 fbnic_fw_free_mbx(fbd); 402 383 fbnic_free_irqs(fbd); 403 384 385 + fbnic_devlink_health_destroy(fbd); 404 386 fbnic_devlink_free(fbd); 405 387 } 406 388 ··· 475 455 * log entries. 476 456 */ 477 457 fbnic_fw_log_enable(fbd, list_empty(&fbd->fw_log.entries)); 458 + 459 + /* Since the FW should be up, check if it reported OTP errors */ 460 + fbnic_devlink_otp_check(fbd, "error detected after PM resume"); 478 461 479 462 /* No netdev means there isn't a network interface to bring up */ 480 463 if (fbnic_init_failure(fbd))
+42 -15
drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
··· 596 596 } 597 597 } 598 598 599 + static void fbnic_clear_valid_macda(struct fbnic_dev *fbd) 600 + { 601 + int idx; 602 + 603 + for (idx = ARRAY_SIZE(fbd->mac_addr); idx--;) { 604 + struct fbnic_mac_addr *mac_addr = &fbd->mac_addr[idx]; 605 + 606 + if (mac_addr->state == FBNIC_TCAM_S_VALID) { 607 + fbnic_clear_macda_entry(fbd, idx); 608 + 609 + mac_addr->state = FBNIC_TCAM_S_UPDATE; 610 + } 611 + } 612 + } 613 + 599 614 static void fbnic_write_macda_entry(struct fbnic_dev *fbd, unsigned int idx, 600 615 struct fbnic_mac_addr *mac_addr) 601 616 { ··· 1139 1124 } 1140 1125 } 1141 1126 1142 - void fbnic_clear_rules(struct fbnic_dev *fbd) 1127 + static void fbnic_clear_valid_act_tcam(struct fbnic_dev *fbd) 1143 1128 { 1144 - u32 dest = FIELD_PREP(FBNIC_RPC_ACT_TBL0_DEST_MASK, 1145 - FBNIC_RPC_ACT_TBL0_DEST_BMC); 1146 1129 int i = FBNIC_RPC_TCAM_ACT_NUM_ENTRIES - 1; 1147 1130 struct fbnic_act_tcam *act_tcam; 1148 1131 1132 + /* Work from the bottom up deleting all other rules from hardware */ 1133 + do { 1134 + act_tcam = &fbd->act_tcam[i]; 1135 + 1136 + if (act_tcam->state != FBNIC_TCAM_S_VALID) 1137 + continue; 1138 + 1139 + fbnic_clear_act_tcam(fbd, i); 1140 + act_tcam->state = FBNIC_TCAM_S_UPDATE; 1141 + } while (i--); 1142 + } 1143 + 1144 + void fbnic_clear_rules(struct fbnic_dev *fbd) 1145 + { 1149 1146 /* Clear MAC rules */ 1150 1147 fbnic_clear_macda(fbd); 1151 1148 ··· 1172 1145 * the interface back up. 
1173 1146 */ 1174 1147 if (fbnic_bmc_present(fbd)) { 1148 + u32 dest = FIELD_PREP(FBNIC_RPC_ACT_TBL0_DEST_MASK, 1149 + FBNIC_RPC_ACT_TBL0_DEST_BMC); 1150 + int i = FBNIC_RPC_TCAM_ACT_NUM_ENTRIES - 1; 1151 + struct fbnic_act_tcam *act_tcam; 1152 + 1175 1153 act_tcam = &fbd->act_tcam[i]; 1176 1154 1177 1155 if (act_tcam->state == FBNIC_TCAM_S_VALID && ··· 1185 1153 wr32(fbd, FBNIC_RPC_ACT_TBL1(i), 0); 1186 1154 1187 1155 act_tcam->state = FBNIC_TCAM_S_UPDATE; 1188 - 1189 - i--; 1190 1156 } 1191 1157 } 1192 1158 1193 - /* Work from the bottom up deleting all other rules from hardware */ 1194 - do { 1195 - act_tcam = &fbd->act_tcam[i]; 1196 - 1197 - if (act_tcam->state != FBNIC_TCAM_S_VALID) 1198 - continue; 1199 - 1200 - fbnic_clear_act_tcam(fbd, i); 1201 - act_tcam->state = FBNIC_TCAM_S_UPDATE; 1202 - } while (i--); 1159 + fbnic_clear_valid_act_tcam(fbd); 1203 1160 } 1204 1161 1205 1162 static void fbnic_delete_act_tcam(struct fbnic_dev *fbd, unsigned int idx) ··· 1237 1216 else 1238 1217 fbnic_update_act_tcam(fbd, i); 1239 1218 } 1219 + } 1220 + 1221 + void fbnic_rpc_reset_valid_entries(struct fbnic_dev *fbd) 1222 + { 1223 + fbnic_clear_valid_act_tcam(fbd); 1224 + fbnic_clear_valid_macda(fbd); 1240 1225 }