nvme-tcp: try to send request in queue_rq context

Today, nvme-tcp always defers sending a request to a workqueue
context, which is one more context switch than we need when
the socket buffer is wide open.

However, because we have async send activity (as a result
of r2t, or write_space callbacks), we need to synchronize
sends from possibly multiple contexts (ideally all running
on the same cpu though).

Thus, we only try to send directly from queue_rq when all of the
following hold:
1. the send_list is empty
2. we can send it synchronously (i.e. not from the RX path)
3. we are running on the same cpu as queue->io_cpu, to avoid
contention on the send operation.

Proposed-by: Mark Wunderlich <mark.wunderlich@intel.com>
Signed-off-by: Mark Wunderlich <mark.wunderlich@intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Sagi Grimberg and committed by

Jens Axboe 6 years ago db5ad6b7 72e5d757

+32 -11

1 changed file

expand all

drivers

nvme

host

tcp.c

+32 -11

drivers/nvme/host/tcp.c

··· 76 76 int io_cpu; 77 77 78 78 spinlock_t lock; 79 + struct mutex send_mutex; 79 80 struct list_head send_list; 80 81 81 82 /* recv state */ ··· 133 132 static struct workqueue_struct *nvme_tcp_wq; 134 133 static struct blk_mq_ops nvme_tcp_mq_ops; 135 134 static struct blk_mq_ops nvme_tcp_admin_mq_ops; 135 + static int nvme_tcp_try_send(struct nvme_tcp_queue *queue); 136 136 137 137 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) 138 138 { ··· 260 258 } 261 259 } 262 260 263 - static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req) 261 + static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, 262 + bool sync) 264 263 { 265 264 struct nvme_tcp_queue *queue = req->queue; 265 + bool empty; 266 266 267 267 spin_lock(&queue->lock); 268 + empty = list_empty(&queue->send_list) && !queue->request; 268 269 list_add_tail(&req->entry, &queue->send_list); 269 270 spin_unlock(&queue->lock); 270 271 271 - queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 272 + /* 273 + * if we're the first on the send_list and we can try to send 274 + * directly, otherwise queue io_work. Also, only do that if we 275 + * are on the same cpu, so we don't introduce contention. 
276 + */ 277 + if (queue->io_cpu == smp_processor_id() && 278 + sync && empty && mutex_trylock(&queue->send_mutex)) { 279 + nvme_tcp_try_send(queue); 280 + mutex_unlock(&queue->send_mutex); 281 + } else { 282 + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 283 + } 272 284 } 273 285 274 286 static inline struct nvme_tcp_request * ··· 595 579 req->state = NVME_TCP_SEND_H2C_PDU; 596 580 req->offset = 0; 597 581 598 - nvme_tcp_queue_request(req); 582 + nvme_tcp_queue_request(req, false); 599 583 600 584 return 0; 601 585 } ··· 1081 1065 bool pending = false; 1082 1066 int result; 1083 1067 1084 - result = nvme_tcp_try_send(queue); 1085 - if (result > 0) 1086 - pending = true; 1087 - else if (unlikely(result < 0)) 1088 - break; 1068 + if (mutex_trylock(&queue->send_mutex)) { 1069 + result = nvme_tcp_try_send(queue); 1070 + mutex_unlock(&queue->send_mutex); 1071 + if (result > 0) 1072 + pending = true; 1073 + else if (unlikely(result < 0)) 1074 + break; 1075 + } 1089 1076 1090 1077 result = nvme_tcp_try_recv(queue); 1091 1078 if (result > 0) ··· 1340 1321 queue->ctrl = ctrl; 1341 1322 INIT_LIST_HEAD(&queue->send_list); 1342 1323 spin_lock_init(&queue->lock); 1324 + mutex_init(&queue->send_mutex); 1343 1325 INIT_WORK(&queue->io_work, nvme_tcp_io_work); 1344 1326 queue->queue_size = queue_size; 1345 1327 ··· 1565 1545 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; 1566 1546 set->reserved_tags = 2; /* connect + keep-alive */ 1567 1547 set->numa_node = NUMA_NO_NODE; 1548 + set->flags = BLK_MQ_F_BLOCKING; 1568 1549 set->cmd_size = sizeof(struct nvme_tcp_request); 1569 1550 set->driver_data = ctrl; 1570 1551 set->nr_hw_queues = 1; ··· 1577 1556 set->queue_depth = nctrl->sqsize + 1; 1578 1557 set->reserved_tags = 1; /* fabric connect */ 1579 1558 set->numa_node = NUMA_NO_NODE; 1580 - set->flags = BLK_MQ_F_SHOULD_MERGE; 1559 + set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; 1581 1560 set->cmd_size = sizeof(struct nvme_tcp_request); 1582 1561 set->driver_data = 
ctrl; 1583 1562 set->nr_hw_queues = nctrl->queue_count - 1; ··· 2136 2115 ctrl->async_req.curr_bio = NULL; 2137 2116 ctrl->async_req.data_len = 0; 2138 2117 2139 - nvme_tcp_queue_request(&ctrl->async_req); 2118 + nvme_tcp_queue_request(&ctrl->async_req, true); 2140 2119 } 2141 2120 2142 2121 static enum blk_eh_timer_return ··· 2267 2246 2268 2247 blk_mq_start_request(rq); 2269 2248 2270 - nvme_tcp_queue_request(req); 2249 + nvme_tcp_queue_request(req, true); 2271 2250 2272 2251 return BLK_STS_OK; 2273 2252 }

Configure Feed

Configure Feed