nvme-tcp: Fix I/O queue cpu spreading for multiple controllers

Since day-1 we are assigning the queue io_cpu very naively. We always
start from the queue id (controller scope) and assign the queue its
matching cpu from the online mask. This works fine when the number of
queues matches the number of cpu cores.

The problem starts when we have fewer queues than cpu cores. First, we
should take into account the mq_map and select a cpu within the cpus
that are assigned to this queue by the mq_map in order to minimize cross
numa cpu bouncing.

Second, even worse is that we don't take into account that multiple
controllers may have assigned queues to a given cpu. As a result we may
simply compound more and more queues on the same set of cpus, which is
suboptimal.

We fix this by introducing global per-cpu counters that track the
number of queues assigned to each cpu, and we select the least used cpu
based on the mq_map and the per-cpu counters, and assign it as the queue
io_cpu.

The behavior for a single controller is slightly optimized by selecting
better cpu candidates by consulting with the mq_map, and multiple
controllers are spreading queues among cpu cores much better, resulting
in lower average cpu load, and less likelihood to hit hotspots.

Note that the accounting is not 100% perfect, but it doesn't need to be,
we're simply putting our best effort to select the best candidate cpu
core that we find at any given point.

Another byproduct is that every controller reset/reconnect may change
the queues io_cpu mapping, based on the current least-used-cpu
accounting scheme.

Here is the baseline queue io_cpu assignment for 4 controllers, 2 queues
per controller, and 4 cpus on the host:
nvme1: queue 0: using cpu 0
nvme1: queue 1: using cpu 1
nvme2: queue 0: using cpu 0
nvme2: queue 1: using cpu 1
nvme3: queue 0: using cpu 0
nvme3: queue 1: using cpu 1
nvme4: queue 0: using cpu 0
nvme4: queue 1: using cpu 1

And this is the fixed io_cpu assignment:
nvme1: queue 0: using cpu 0
nvme1: queue 1: using cpu 2
nvme2: queue 0: using cpu 1
nvme2: queue 1: using cpu 3
nvme3: queue 0: using cpu 0
nvme3: queue 1: using cpu 2
nvme4: queue 0: using cpu 1
nvme4: queue 1: using cpu 3

Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver")
Suggested-by: Hannes Reinecke <hare@kernel.org>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
[fixed kbuild reported errors]
Signed-off-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>

authored by

Sagi Grimberg and committed by

Keith Busch 1 year ago 32193789 3ec5c62c

+57 -13

1 changed file

expand all

drivers

nvme

host

tcp.c

+57 -13

drivers/nvme/host/tcp.c

··· 54 54 "nvme TLS handshake timeout in seconds (default 10)"); 55 55 #endif 56 56 57 + static atomic_t nvme_tcp_cpu_queues[NR_CPUS]; 58 + 57 59 #ifdef CONFIG_DEBUG_LOCK_ALLOC 58 60 /* lockdep can detect a circular dependency of the form 59 61 * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock ··· 129 127 NVME_TCP_Q_ALLOCATED = 0, 130 128 NVME_TCP_Q_LIVE = 1, 131 129 NVME_TCP_Q_POLLING = 2, 130 + NVME_TCP_Q_IO_CPU_SET = 3, 132 131 }; 133 132 134 133 enum nvme_tcp_recv_state { ··· 1565 1562 ctrl->io_queues[HCTX_TYPE_POLL]; 1566 1563 } 1567 1564 1565 + /** 1566 + * Track the number of queues assigned to each cpu using a global per-cpu 1567 + * counter and select the least used cpu from the mq_map. Our goal is to spread 1568 + * different controllers I/O threads across different cpu cores. 1569 + * 1570 + * Note that the accounting is not 100% perfect, but we don't need to be, we're 1571 + * simply putting our best effort to select the best candidate cpu core that we 1572 + * find at any given point. 
1573 + */ 1568 1574 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) 1569 1575 { 1570 1576 struct nvme_tcp_ctrl *ctrl = queue->ctrl; 1571 - int qid = nvme_tcp_queue_id(queue); 1572 - int n = 0; 1577 + struct blk_mq_tag_set *set = &ctrl->tag_set; 1578 + int qid = nvme_tcp_queue_id(queue) - 1; 1579 + unsigned int *mq_map = NULL; 1580 + int cpu, min_queues = INT_MAX, io_cpu; 1581 + 1582 + if (wq_unbound) 1583 + goto out; 1573 1584 1574 1585 if (nvme_tcp_default_queue(queue)) 1575 - n = qid - 1; 1586 + mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map; 1576 1587 else if (nvme_tcp_read_queue(queue)) 1577 - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; 1588 + mq_map = set->map[HCTX_TYPE_READ].mq_map; 1578 1589 else if (nvme_tcp_poll_queue(queue)) 1579 - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1580 - ctrl->io_queues[HCTX_TYPE_READ] - 1; 1581 - if (wq_unbound) 1582 - queue->io_cpu = WORK_CPU_UNBOUND; 1583 - else 1584 - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); 1590 + mq_map = set->map[HCTX_TYPE_POLL].mq_map; 1591 + 1592 + if (WARN_ON(!mq_map)) 1593 + goto out; 1594 + 1595 + /* Search for the least used cpu from the mq_map */ 1596 + io_cpu = WORK_CPU_UNBOUND; 1597 + for_each_online_cpu(cpu) { 1598 + int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]); 1599 + 1600 + if (mq_map[cpu] != qid) 1601 + continue; 1602 + if (num_queues < min_queues) { 1603 + io_cpu = cpu; 1604 + min_queues = num_queues; 1605 + } 1606 + } 1607 + if (io_cpu != WORK_CPU_UNBOUND) { 1608 + queue->io_cpu = io_cpu; 1609 + atomic_inc(&nvme_tcp_cpu_queues[io_cpu]); 1610 + set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags); 1611 + } 1612 + out: 1613 + dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n", 1614 + qid, queue->io_cpu); 1585 1615 } 1586 1616 1587 1617 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) ··· 1758 1722 1759 1723 queue->sock->sk->sk_allocation = GFP_ATOMIC; 1760 1724 queue->sock->sk->sk_use_task_frag = false; 
1761 - nvme_tcp_set_queue_io_cpu(queue); 1725 + queue->io_cpu = WORK_CPU_UNBOUND; 1762 1726 queue->request = NULL; 1763 1727 queue->data_remaining = 0; 1764 1728 queue->ddgst_remaining = 0; ··· 1880 1844 if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) 1881 1845 return; 1882 1846 1847 + if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags)) 1848 + atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]); 1849 + 1883 1850 mutex_lock(&queue->queue_lock); 1884 1851 if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) 1885 1852 __nvme_tcp_stop_queue(queue); ··· 1917 1878 nvme_tcp_init_recv_ctx(queue); 1918 1879 nvme_tcp_setup_sock_ops(queue); 1919 1880 1920 - if (idx) 1881 + if (idx) { 1882 + nvme_tcp_set_queue_io_cpu(queue); 1921 1883 ret = nvmf_connect_io_queue(nctrl, idx); 1922 - else 1884 + } else 1923 1885 ret = nvmf_connect_admin_queue(nctrl); 1924 1886 1925 1887 if (!ret) { ··· 2889 2849 static int __init nvme_tcp_init_module(void) 2890 2850 { 2891 2851 unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; 2852 + int cpu; 2892 2853 2893 2854 BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); 2894 2855 BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); ··· 2906 2865 nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0); 2907 2866 if (!nvme_tcp_wq) 2908 2867 return -ENOMEM; 2868 + 2869 + for_each_possible_cpu(cpu) 2870 + atomic_set(&nvme_tcp_cpu_queues[cpu], 0); 2909 2871 2910 2872 nvmf_register_transport(&nvme_tcp_transport); 2911 2873 return 0;

Configure Feed

Configure Feed