Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/hns: Support drain SQ and RQ

Some ULPs, e.g. rpcrdma, rely on drain_qp() to ensure all outstanding
requests are completed before releasing related memory. If drain_qp()
fails, ULPs may release memory directly, and in-flight WRs may later be
flushed after the memory is freed, potentially leading to UAF.

drain_qp() failures can happen when HW enters an error state or is
reset. Add support to drain SQ and RQ in such cases by posting a
fake WR during reset, so the driver can process all remaining WRs in
sequence and generate corresponding completions.

Always invoke comp_handler() in the drain process to ensure completions
are not lost under concurrency (e.g. concurrent post_send() and
reset, or QPs created during reset). If the CQ is being processed or
has already been processed, first cancel any already-scheduled
comp_handler() and then invoke it again, to avoid concurrency issues.

Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Link: https://patch.msgid.link/20260108113032.856306-1-huangjunxian6@hisilicon.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Chengchang Tang and committed by Leon Romanovsky
354e7a6d 5c3f795d

+166
+166
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
··· 876 876 return ret; 877 877 } 878 878 879 + static int hns_roce_push_drain_wr(struct hns_roce_wq *wq, struct ib_cq *cq, 880 + u64 wr_id) 881 + { 882 + unsigned long flags; 883 + int ret = 0; 884 + 885 + spin_lock_irqsave(&wq->lock, flags); 886 + if (hns_roce_wq_overflow(wq, 1, cq)) { 887 + ret = -ENOMEM; 888 + goto out; 889 + } 890 + 891 + wq->wrid[wq->head & (wq->wqe_cnt - 1)] = wr_id; 892 + wq->head++; 893 + 894 + out: 895 + spin_unlock_irqrestore(&wq->lock, flags); 896 + return ret; 897 + } 898 + 899 + struct hns_roce_drain_cqe { 900 + struct ib_cqe cqe; 901 + struct completion done; 902 + }; 903 + 904 + static void hns_roce_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) 905 + { 906 + struct hns_roce_drain_cqe *cqe = container_of(wc->wr_cqe, 907 + struct hns_roce_drain_cqe, 908 + cqe); 909 + complete(&cqe->done); 910 + } 911 + 912 + static void handle_drain_completion(struct ib_cq *ibcq, 913 + struct hns_roce_drain_cqe *drain, 914 + struct hns_roce_dev *hr_dev) 915 + { 916 + #define TIMEOUT (HZ / 10) 917 + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); 918 + unsigned long flags; 919 + bool triggered; 920 + 921 + if (ibcq->poll_ctx == IB_POLL_DIRECT) { 922 + while (wait_for_completion_timeout(&drain->done, TIMEOUT) <= 0) 923 + ib_process_cq_direct(ibcq, -1); 924 + return; 925 + } 926 + 927 + if (hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) 928 + goto waiting_done; 929 + 930 + spin_lock_irqsave(&hr_cq->lock, flags); 931 + triggered = hr_cq->is_armed; 932 + hr_cq->is_armed = 1; 933 + spin_unlock_irqrestore(&hr_cq->lock, flags); 934 + 935 + /* Triggered means this cq is processing or has been processed 936 + * by hns_roce_handle_device_err() or this function. We need to 937 + * cancel the already invoked comp_handler() to avoid concurrency. 938 + * If it has not been triggered, we can directly invoke 939 + * comp_handler(). 
940 + */ 941 + if (triggered) { 942 + switch (ibcq->poll_ctx) { 943 + case IB_POLL_SOFTIRQ: 944 + irq_poll_disable(&ibcq->iop); 945 + irq_poll_enable(&ibcq->iop); 946 + break; 947 + case IB_POLL_WORKQUEUE: 948 + case IB_POLL_UNBOUND_WORKQUEUE: 949 + cancel_work_sync(&ibcq->work); 950 + break; 951 + default: 952 + WARN_ON_ONCE(1); 953 + } 954 + } 955 + 956 + if (ibcq->comp_handler) 957 + ibcq->comp_handler(ibcq, ibcq->cq_context); 958 + 959 + waiting_done: 960 + if (ibcq->comp_handler) 961 + wait_for_completion(&drain->done); 962 + } 963 + 964 + static void hns_roce_v2_drain_rq(struct ib_qp *ibqp) 965 + { 966 + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); 967 + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; 968 + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); 969 + struct hns_roce_drain_cqe rdrain = {}; 970 + const struct ib_recv_wr *bad_rwr; 971 + struct ib_cq *cq = ibqp->recv_cq; 972 + struct ib_recv_wr rwr = {}; 973 + int ret; 974 + 975 + ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE); 976 + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { 977 + ibdev_err_ratelimited(&hr_dev->ib_dev, 978 + "failed to modify qp during drain rq, ret = %d.\n", 979 + ret); 980 + return; 981 + } 982 + 983 + rwr.wr_cqe = &rdrain.cqe; 984 + rdrain.cqe.done = hns_roce_drain_qp_done; 985 + init_completion(&rdrain.done); 986 + 987 + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) 988 + ret = hns_roce_push_drain_wr(&hr_qp->rq, cq, rwr.wr_id); 989 + else 990 + ret = hns_roce_v2_post_recv(ibqp, &rwr, &bad_rwr); 991 + if (ret) { 992 + ibdev_err_ratelimited(&hr_dev->ib_dev, 993 + "failed to post recv for drain rq, ret = %d.\n", 994 + ret); 995 + return; 996 + } 997 + 998 + handle_drain_completion(cq, &rdrain, hr_dev); 999 + } 1000 + 1001 + static void hns_roce_v2_drain_sq(struct ib_qp *ibqp) 1002 + { 1003 + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); 1004 + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; 1005 + struct hns_roce_qp *hr_qp = 
to_hr_qp(ibqp); 1006 + struct hns_roce_drain_cqe sdrain = {}; 1007 + const struct ib_send_wr *bad_swr; 1008 + struct ib_cq *cq = ibqp->send_cq; 1009 + struct ib_rdma_wr swr = { 1010 + .wr = { 1011 + .next = NULL, 1012 + { .wr_cqe = &sdrain.cqe, }, 1013 + .opcode = IB_WR_RDMA_WRITE, 1014 + }, 1015 + }; 1016 + int ret; 1017 + 1018 + ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE); 1019 + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { 1020 + ibdev_err_ratelimited(&hr_dev->ib_dev, 1021 + "failed to modify qp during drain sq, ret = %d.\n", 1022 + ret); 1023 + return; 1024 + } 1025 + 1026 + sdrain.cqe.done = hns_roce_drain_qp_done; 1027 + init_completion(&sdrain.done); 1028 + 1029 + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) 1030 + ret = hns_roce_push_drain_wr(&hr_qp->sq, cq, swr.wr.wr_id); 1031 + else 1032 + ret = hns_roce_v2_post_send(ibqp, &swr.wr, &bad_swr); 1033 + if (ret) { 1034 + ibdev_err_ratelimited(&hr_dev->ib_dev, 1035 + "failed to post send for drain sq, ret = %d.\n", 1036 + ret); 1037 + return; 1038 + } 1039 + 1040 + handle_drain_completion(cq, &sdrain, hr_dev); 1041 + } 1042 + 879 1043 static void *get_srq_wqe_buf(struct hns_roce_srq *srq, u32 n) 880 1044 { 881 1045 return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); ··· 7204 7040 .post_send = hns_roce_v2_post_send, 7205 7041 .query_qp = hns_roce_v2_query_qp, 7206 7042 .req_notify_cq = hns_roce_v2_req_notify_cq, 7043 + .drain_rq = hns_roce_v2_drain_rq, 7044 + .drain_sq = hns_roce_v2_drain_sq, 7207 7045 }; 7208 7046 7209 7047 static const struct ib_device_ops hns_roce_v2_dev_srq_ops = {