Merge tag 'block-6.16-20250626' of git://git.kernel.dk/linux

+15 -11

block/genhd.c

···
128 128  static void bdev_count_inflight_rw(struct block_device *part,
129 129  		unsigned int inflight[2], bool mq_driver)
130 130  {
131   +  	int write = 0;
132   +  	int read = 0;
131 133  	int cpu;
132 134
133 135  	if (mq_driver) {
134 136  		blk_mq_in_driver_rw(part, inflight);
135   -  	} else {
136   -  		for_each_possible_cpu(cpu) {
137   -  			inflight[READ] += part_stat_local_read_cpu(
138   -  				part, in_flight[READ], cpu);
139   -  			inflight[WRITE] += part_stat_local_read_cpu(
140   -  				part, in_flight[WRITE], cpu);
141   -  		}
137   +  		return;
142 138  	}
143 139
144   -  	if (WARN_ON_ONCE((int)inflight[READ] < 0))
145   -  		inflight[READ] = 0;
146   -  	if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
147   -  		inflight[WRITE] = 0;
140   +  	for_each_possible_cpu(cpu) {
141   +  		read += part_stat_local_read_cpu(part, in_flight[READ], cpu);
142   +  		write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu);
143   +  	}
144   +
145   +  	/*
146   +  	 * While iterating all CPUs, some IOs may be issued from a CPU already
147   +  	 * traversed and complete on a CPU that has not yet been traversed,
148   +  	 * causing the inflight number to be negative.
149   +  	 */
150   +  	inflight[READ] = read > 0 ? read : 0;
151   +  	inflight[WRITE] = write > 0 ? write : 0;
148 152  }
149 153
150 154  /**

+37 -12

drivers/block/ublk_drv.c

···
1148 1148  	blk_mq_end_request(req, res);
1149 1149  }
1150 1150
1151    -  static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1152    -  		int res, unsigned issue_flags)
1151    +  static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1152    +  		struct request *req)
1153 1153  {
1154 1154  	/* read cmd first because req will overwrite it */
1155 1155  	struct io_uring_cmd *cmd = io->cmd;
···
1164 1164  	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1165 1165
1166 1166  	io->req = req;
1167    +  	return cmd;
1168    +  }
1169    +
1170    +  static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1171    +  		int res, unsigned issue_flags)
1172    +  {
1173    +  	struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1167 1174
1168 1175  	/* tell ublksrv one io request is coming */
1169 1176  	io_uring_cmd_done(cmd, res, 0, issue_flags);
···
1423 1416  	return BLK_STS_OK;
1424 1417  }
1425 1418
1419    +  static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
1420    +  		const struct ublk_io *io2)
1421    +  {
1422    +  	return (io_uring_cmd_ctx_handle(io->cmd) ==
1423    +  		io_uring_cmd_ctx_handle(io2->cmd)) &&
1424    +  		(io->task == io2->task);
1425    +  }
1426    +
1426 1427  static void ublk_queue_rqs(struct rq_list *rqlist)
1427 1428  {
1428 1429  	struct rq_list requeue_list = { };
···
1442 1427  		struct ublk_queue *this_q = req->mq_hctx->driver_data;
1443 1428  		struct ublk_io *this_io = &this_q->ios[req->tag];
1444 1429
1445    -  		if (io && io->task != this_io->task && !rq_list_empty(&submit_list))
1430    +  		if (io && !ublk_belong_to_same_batch(io, this_io) &&
1431    +  				!rq_list_empty(&submit_list))
1446 1432  			ublk_queue_cmd_list(io, &submit_list);
1447 1433  		io = this_io;
1448 1434
···
2164 2148  	return 0;
2165 2149  }
2166 2150
2167    -  static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
2151    +  static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
2152    +  		struct request *req)
2168 2153  {
2169    -  	struct request *req = io->req;
2170    -
2171 2154  	/*
2172 2155  	 * We have handled UBLK_IO_NEED_GET_DATA command,
2173 2156  	 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
···
2193 2178  	u32 cmd_op = cmd->cmd_op;
2194 2179  	unsigned tag = ub_cmd->tag;
2195 2180  	int ret = -EINVAL;
2181    +  	struct request *req;
2196 2182
2197 2183  	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
2198 2184  			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
···
2252 2236  			goto out;
2253 2237  		break;
2254 2238  	case UBLK_IO_NEED_GET_DATA:
2255    -  		io->addr = ub_cmd->addr;
2256    -  		if (!ublk_get_data(ubq, io))
2257    -  			return -EIOCBQUEUED;
2258    -
2259    -  		return UBLK_IO_RES_OK;
2239    +  		/*
2240    +  		 * ublk_get_data() may fail and fallback to requeue, so keep
2241    +  		 * uring_cmd active first and prepare for handling new requeued
2242    +  		 * request
2243    +  		 */
2244    +  		req = io->req;
2245    +  		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
2246    +  		io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
2247    +  		if (likely(ublk_get_data(ubq, io, req))) {
2248    +  			__ublk_prep_compl_io_cmd(io, req);
2249    +  			return UBLK_IO_RES_OK;
2250    +  		}
2251    +  		break;
2260 2252  	default:
2261 2253  		goto out;
2262 2254  	}
···
2849 2825  	if (copy_from_user(&info, argp, sizeof(info)))
2850 2826  		return -EFAULT;
2851 2827
2852    -  	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES)
2828    +  	if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
2829    +  	    info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
2853 2830  		return -EINVAL;
2854 2831
2855 2832  	if (capable(CAP_SYS_ADMIN))

+42 -45

drivers/nvme/host/core.c

···
2015 2015  }
2016 2016
2017 2017
2018    -  static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
2019    -  		struct nvme_id_ns *id, struct queue_limits *lim,
2020    -  		u32 bs, u32 atomic_bs)
2018    +  static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
2019    +  		struct nvme_id_ns *id, struct queue_limits *lim, u32 bs)
2021 2020  {
2022    -  	unsigned int boundary = 0;
2021    +  	u32 atomic_bs, boundary = 0;
2023 2022
2024    -  	if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
2025    -  		if (le16_to_cpu(id->nabspf))
2023    +  	/*
2024    +  	 * We do not support an offset for the atomic boundaries.
2025    +  	 */
2026    +  	if (id->nabo)
2027    +  		return bs;
2028    +
2029    +  	if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
2030    +  		/*
2031    +  		 * Use the per-namespace atomic write unit when available.
2032    +  		 */
2033    +  		atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2034    +  		if (id->nabspf)
2026 2035  			boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
2036    +  	} else {
2037    +  		/*
2038    +  		 * Use the controller wide atomic write unit. This sucks
2039    +  		 * because the limit is defined in terms of logical blocks while
2040    +  		 * namespaces can have different formats, and because there is
2041    +  		 * no clear language in the specification prohibiting different
2042    +  		 * values for different controllers in the subsystem.
2043    +  		 */
2044    +  		atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2027 2045  	}
2046    +
2028 2047  	lim->atomic_write_hw_max = atomic_bs;
2029 2048  	lim->atomic_write_hw_boundary = boundary;
2030 2049  	lim->atomic_write_hw_unit_min = bs;
2031 2050  	lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
2032 2051  	lim->features |= BLK_FEAT_ATOMIC_WRITES;
2052    +  	return atomic_bs;
2033 2053  }
2034 2054
2035 2055  static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
···
2087 2067  		valid = false;
2088 2068  	}
2089 2069
2090    -  	atomic_bs = phys_bs = bs;
2091    -  	if (id->nabo == 0) {
2092    -  		/*
2093    -  		 * Bit 1 indicates whether NAWUPF is defined for this namespace
2094    -  		 * and whether it should be used instead of AWUPF. If NAWUPF ==
2095    -  		 * 0 then AWUPF must be used instead.
2096    -  		 */
2097    -  		if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2098    -  			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2099    -  		else
2100    -  			atomic_bs = (1 + ns->ctrl->awupf) * bs;
2101    -
2102    -  		/*
2103    -  		 * Set subsystem atomic bs.
2104    -  		 */
2105    -  		if (ns->ctrl->subsys->atomic_bs) {
2106    -  			if (atomic_bs != ns->ctrl->subsys->atomic_bs) {
2107    -  				dev_err_ratelimited(ns->ctrl->device,
2108    -  					"%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n",
2109    -  					ns->disk ? ns->disk->disk_name : "?",
2110    -  					ns->ctrl->subsys->atomic_bs,
2111    -  					atomic_bs);
2112    -  			}
2113    -  		} else
2114    -  			ns->ctrl->subsys->atomic_bs = atomic_bs;
2115    -
2116    -  		nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
2117    -  	}
2070    +  	phys_bs = bs;
2071    +  	atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
2118 2072
2119 2073  	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2120 2074  		/* NPWG = Namespace Preferred Write Granularity */
···
2375 2381  	nvme_set_chunk_sectors(ns, id, &lim);
2376 2382  	if (!nvme_update_disk_info(ns, id, &lim))
2377 2383  		capacity = 0;
2378    -
2379    -  	/*
2380    -  	 * Validate the max atomic write size fits within the subsystem's
2381    -  	 * atomic write capabilities.
2382    -  	 */
2383    -  	if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) {
2384    -  		blk_mq_unfreeze_queue(ns->disk->queue, memflags);
2385    -  		ret = -ENXIO;
2386    -  		goto out;
2387    -  	}
2388 2384
2389 2385  	nvme_config_discard(ns, &lim);
2390 2386  	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
···
3199 3215  	memcpy(subsys->model, id->mn, sizeof(subsys->model));
3200 3216  	subsys->vendor_id = le16_to_cpu(id->vid);
3201 3217  	subsys->cmic = id->cmic;
3218    +  	subsys->awupf = le16_to_cpu(id->awupf);
3202 3219
3203 3220  	/* Versions prior to 1.4 don't necessarily report a valid type */
3204 3221  	if (id->cntrltype == NVME_CTRL_DISC ||
···
3537 3552  		if (ret)
3538 3553  			goto out_free;
3539 3554  	}
3555    +
3556    +  	if (le16_to_cpu(id->awupf) != ctrl->subsys->awupf) {
3557    +  		dev_err_ratelimited(ctrl->device,
3558    +  			"inconsistent AWUPF, controller not added (%u/%u).\n",
3559    +  			le16_to_cpu(id->awupf), ctrl->subsys->awupf);
3560    +  		ret = -EINVAL;
3561    +  		goto out_free;
3562    +  	}
3563    +
3540 3564  	memcpy(ctrl->subsys->firmware_rev, id->fr,
3541 3565  		sizeof(ctrl->subsys->firmware_rev));
3542 3566
···
3641 3647  		dev_pm_qos_expose_latency_tolerance(ctrl->device);
3642 3648  	else if (!ctrl->apst_enabled && prev_apst_enabled)
3643 3649  		dev_pm_qos_hide_latency_tolerance(ctrl->device);
3644    -  	ctrl->awupf = le16_to_cpu(id->awupf);
3645 3650  out_free:
3646 3651  	kfree(id);
3647 3652  	return ret;
···
4029 4036  	list_add_tail_rcu(&ns->siblings, &head->list);
4030 4037  	ns->head = head;
4031 4038  	mutex_unlock(&ctrl->subsys->lock);
4039    +
4040    +  #ifdef CONFIG_NVME_MULTIPATH
4041    +  	cancel_delayed_work(&head->remove_work);
4042    +  #endif
4032 4043  	return 0;
4033 4044
4034 4045  out_put_ns_head:

+1 -1

drivers/nvme/host/multipath.c

···
1311 1311  		 */
1312 1312  		if (!try_module_get(THIS_MODULE))
1313 1313  			goto out;
1314    -  		queue_delayed_work(nvme_wq, &head->remove_work,
1314    +  		mod_delayed_work(nvme_wq, &head->remove_work,
1315 1315  				head->delayed_removal_secs * HZ);
1316 1316  	} else {
1317 1317  		list_del_init(&head->entry);

+1 -2

drivers/nvme/host/nvme.h

···
410 410
411 411  	enum nvme_ctrl_type cntrltype;
412 412  	enum nvme_dctype dctype;
413   -  	u16 awupf; /* 0's based value. */
414 413  };
415 414
416 415  static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
···
442 443  	u8 cmic;
443 444  	enum nvme_subsys_type subtype;
444 445  	u16 vendor_id;
446   +  	u16 awupf; /* 0's based value. */
445 447  	struct ida ns_ida;
446 448  #ifdef CONFIG_NVME_MULTIPATH
447 449  	enum nvme_iopolicy iopolicy;
448 450  #endif
449   -  	u32 atomic_bs;
450 451  };
451 452
452 453  /*

+26 -6

include/uapi/linux/ublk_cmd.h

···
135 135  #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
136 136
137 137  /*
138   -   * zero copy requires 4k block size, and can remap ublk driver's io
139   -   * request into ublksrv's vm space
138   +   * ublk server can register data buffers for incoming I/O requests with a sparse
139   +   * io_uring buffer table. The request buffer can then be used as the data buffer
140   +   * for io_uring operations via the fixed buffer index.
141   +   * Note that the ublk server can never directly access the request data memory.
142   +   *
143   +   * To use this feature, the ublk server must first register a sparse buffer
144   +   * table on an io_uring instance.
145   +   * When an incoming ublk request is received, the ublk server submits a
146   +   * UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The
147   +   * ublksrv_io_cmd's q_id and tag specify the request whose buffer to register
148   +   * and addr is the index in the io_uring's buffer table to install the buffer.
149   +   * SQEs can now be submitted to the io_uring to read/write the request's buffer
150   +   * by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or
151   +   * IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index.
152   +   * Once the last io_uring operation using the request's buffer has completed,
153   +   * the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag,
154   +   * and addr again specifying the request buffer to unregister.
155   +   * The ublk request is completed when its buffer is unregistered from all
156   +   * io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ.
157   +   *
158   +   * Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak
159   +   * uninitialized kernel memory by not reading into the full request buffer.
140 160   */
141 161  #define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0)
142 162
···
470 450  		__u64 sqe_addr)
471 451  {
472 452  	struct ublk_auto_buf_reg reg = {
473   -  		.index = sqe_addr & 0xffff,
474   -  		.flags = (sqe_addr >> 16) & 0xff,
475   -  		.reserved0 = (sqe_addr >> 24) & 0xff,
476   -  		.reserved1 = sqe_addr >> 32,
453   +  		.index = (__u16)sqe_addr,
454   +  		.flags = (__u8)(sqe_addr >> 16),
455   +  		.reserved0 = (__u8)(sqe_addr >> 24),
456   +  		.reserved1 = (__u32)(sqe_addr >> 32),
477 457  	};
478 458
479 459  	return reg;

+3 -2

tools/testing/selftests/ublk/test_stress_03.sh

···
32 32  ublk_io_and_remove 8G -t null -q 4 -z &
33 33  ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
34 34  ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
35  +  wait
35 36
36 37  if _have_feature "AUTO_BUF_REG"; then
37 38  	ublk_io_and_remove 8G -t null -q 4 --auto_zc &
38 39  	ublk_io_and_remove 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
39 40  	ublk_io_and_remove 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
40 41  	ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
42  +  	wait
41 43  fi
42  -  wait
43 44
44 45  if _have_feature "PER_IO_DAEMON"; then
45 46  	ublk_io_and_remove 8G -t null -q 4 --auto_zc --nthreads 8 --per_io_tasks &
46 47  	ublk_io_and_remove 256M -t loop -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[0]}" &
47 48  	ublk_io_and_remove 256M -t stripe -q 4 --auto_zc --nthreads 8 --per_io_tasks "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
48 49  	ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback --nthreads 8 --per_io_tasks &
50  +  	wait
49 51  fi
50  -  wait
51 52
52 53  _cleanup_test "stress"
53 54  _show_result $TID $ERR_CODE

Configure Feed

Configure Feed