ublk: implement NUMA-aware memory allocation

Implement NUMA-friendly memory allocation for the ublk driver to improve
performance on multi-socket systems.

This commit includes the following changes:

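1. Rename __queues to queues, dropping the __ prefix since the field is
now accessed directly throughout the codebase rather than only through
the ublk_get_queue() helper. The field also changes from a single char
buffer to a flexible array of per-queue pointers at the end of struct
ublk_device, sized by nr_hw_queues when the device is allocated.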

2. Remove the queue_size field from struct ublk_device as it is no longer
needed.

3. Move queue allocation and deallocation into ublk_init_queue() and
ublk_deinit_queue() respectively, improving encapsulation. This
simplifies ublk_init_queues() and ublk_deinit_queues() to just
iterate and call the per-queue functions.

4. Add ublk_get_queue_numa_node() helper function to determine the
appropriate NUMA node for a queue by finding the first CPU mapped
to that queue via tag_set.map[HCTX_TYPE_DEFAULT].mq_map[] and
converting it to a NUMA node using cpu_to_node(). This function is
called internally by ublk_init_queue() to determine the allocation
node.

5. Allocate each queue structure on its local NUMA node using
kvzalloc_node() in ublk_init_queue().

6. Allocate the I/O command buffer on the same NUMA node using
alloc_pages_node().

This reduces memory access latency on multi-socket NUMA systems by
ensuring each queue's data structures are local to the CPUs that
access them.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Ming Lei and committed by

Jens Axboe 7 months ago 529d4d63 011af85c

+53 -31

1 changed file

expand all

drivers

block

ublk_drv.c

+53 -31

drivers/block/ublk_drv.c

··· 209 209 struct ublk_device { 210 210 struct gendisk *ub_disk; 211 211 212 - char *__queues; 213 - 214 - unsigned int queue_size; 215 212 struct ublksrv_ctrl_dev_info dev_info; 216 213 217 214 struct blk_mq_tag_set tag_set; ··· 236 239 bool canceling; 237 240 pid_t ublksrv_tgid; 238 241 struct delayed_work exit_work; 242 + 243 + struct ublk_queue *queues[]; 239 244 }; 240 245 241 246 /* header of ublk_params */ ··· 780 781 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, 781 782 int qid) 782 783 { 783 - return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]); 784 + return dev->queues[qid]; 784 785 } 785 786 786 787 static inline bool ublk_rq_has_data(const struct request *rq) ··· 2661 2662 2662 2663 static void ublk_deinit_queue(struct ublk_device *ub, int q_id) 2663 2664 { 2664 - int size = ublk_queue_cmd_buf_size(ub); 2665 - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 2666 - int i; 2665 + struct ublk_queue *ubq = ub->queues[q_id]; 2666 + int size, i; 2667 + 2668 + if (!ubq) 2669 + return; 2670 + 2671 + size = ublk_queue_cmd_buf_size(ub); 2667 2672 2668 2673 for (i = 0; i < ubq->q_depth; i++) { 2669 2674 struct ublk_io *io = &ubq->ios[i]; ··· 2679 2676 2680 2677 if (ubq->io_cmd_buf) 2681 2678 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); 2679 + 2680 + kvfree(ubq); 2681 + ub->queues[q_id] = NULL; 2682 + } 2683 + 2684 + static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id) 2685 + { 2686 + unsigned int cpu; 2687 + 2688 + /* Find first CPU mapped to this queue */ 2689 + for_each_possible_cpu(cpu) { 2690 + if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id) 2691 + return cpu_to_node(cpu); 2692 + } 2693 + 2694 + return NUMA_NO_NODE; 2682 2695 } 2683 2696 2684 2697 static int ublk_init_queue(struct ublk_device *ub, int q_id) 2685 2698 { 2686 - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 2699 + int depth = ub->dev_info.queue_depth; 2700 + int ubq_size = sizeof(struct ublk_queue) + 
depth * sizeof(struct ublk_io); 2687 2701 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; 2688 - void *ptr; 2702 + struct ublk_queue *ubq; 2703 + struct page *page; 2704 + int numa_node; 2689 2705 int size; 2706 + 2707 + /* Determine NUMA node based on queue's CPU affinity */ 2708 + numa_node = ublk_get_queue_numa_node(ub, q_id); 2709 + 2710 + /* Allocate queue structure on local NUMA node */ 2711 + ubq = kvzalloc_node(ubq_size, GFP_KERNEL, numa_node); 2712 + if (!ubq) 2713 + return -ENOMEM; 2690 2714 2691 2715 spin_lock_init(&ubq->cancel_lock); 2692 2716 ubq->flags = ub->dev_info.flags; 2693 2717 ubq->q_id = q_id; 2694 - ubq->q_depth = ub->dev_info.queue_depth; 2718 + ubq->q_depth = depth; 2695 2719 size = ublk_queue_cmd_buf_size(ub); 2696 2720 2697 - ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); 2698 - if (!ptr) 2721 + /* Allocate I/O command buffer on local NUMA node */ 2722 + page = alloc_pages_node(numa_node, gfp_flags, get_order(size)); 2723 + if (!page) { 2724 + kvfree(ubq); 2699 2725 return -ENOMEM; 2726 + } 2727 + ubq->io_cmd_buf = page_address(page); 2700 2728 2701 - ubq->io_cmd_buf = ptr; 2729 + ub->queues[q_id] = ubq; 2702 2730 ubq->dev = ub; 2703 2731 return 0; 2704 2732 } 2705 2733 2706 2734 static void ublk_deinit_queues(struct ublk_device *ub) 2707 2735 { 2708 - int nr_queues = ub->dev_info.nr_hw_queues; 2709 2736 int i; 2710 2737 2711 - if (!ub->__queues) 2712 - return; 2713 - 2714 - for (i = 0; i < nr_queues; i++) 2738 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2715 2739 ublk_deinit_queue(ub, i); 2716 - kvfree(ub->__queues); 2717 2740 } 2718 2741 2719 2742 static int ublk_init_queues(struct ublk_device *ub) 2720 2743 { 2721 - int nr_queues = ub->dev_info.nr_hw_queues; 2722 - int depth = ub->dev_info.queue_depth; 2723 - int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); 2724 - int i, ret = -ENOMEM; 2744 + int i, ret; 2725 2745 2726 - ub->queue_size = ubq_size; 2727 - ub->__queues = kvcalloc(nr_queues, 
ubq_size, GFP_KERNEL); 2728 - if (!ub->__queues) 2729 - return ret; 2730 - 2731 - for (i = 0; i < nr_queues; i++) { 2732 - if (ublk_init_queue(ub, i)) 2746 + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 2747 + ret = ublk_init_queue(ub, i); 2748 + if (ret) 2733 2749 goto fail; 2734 2750 } 2735 2751 ··· 3150 3128 goto out_unlock; 3151 3129 3152 3130 ret = -ENOMEM; 3153 - ub = kzalloc(sizeof(*ub), GFP_KERNEL); 3131 + ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL); 3154 3132 if (!ub) 3155 3133 goto out_unlock; 3156 3134 mutex_init(&ub->mutex);

Configure Feed

Configure Feed