Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
1// SPDX-License-Identifier: GPL-2.0-or-later 2/* 3 * Userspace block device - block device which IO is handled from userspace 4 * 5 * Take full use of io_uring passthrough command for communicating with 6 * ublk userspace daemon(ublksrvd) for handling basic IO request. 7 * 8 * Copyright 2022 Ming Lei <ming.lei@redhat.com> 9 * 10 * (part of code stolen from loop.c) 11 */ 12#include <linux/module.h> 13#include <linux/moduleparam.h> 14#include <linux/sched.h> 15#include <linux/fs.h> 16#include <linux/pagemap.h> 17#include <linux/file.h> 18#include <linux/stat.h> 19#include <linux/errno.h> 20#include <linux/major.h> 21#include <linux/wait.h> 22#include <linux/blkdev.h> 23#include <linux/init.h> 24#include <linux/swap.h> 25#include <linux/slab.h> 26#include <linux/compat.h> 27#include <linux/mutex.h> 28#include <linux/writeback.h> 29#include <linux/completion.h> 30#include <linux/highmem.h> 31#include <linux/sysfs.h> 32#include <linux/miscdevice.h> 33#include <linux/falloc.h> 34#include <linux/uio.h> 35#include <linux/ioprio.h> 36#include <linux/sched/mm.h> 37#include <linux/uaccess.h> 38#include <linux/cdev.h> 39#include <linux/io_uring/cmd.h> 40#include <linux/blk-mq.h> 41#include <linux/delay.h> 42#include <linux/mm.h> 43#include <asm/page.h> 44#include <linux/task_work.h> 45#include <linux/namei.h> 46#include <linux/kref.h> 47#include <linux/kfifo.h> 48#include <linux/blk-integrity.h> 49#include <linux/maple_tree.h> 50#include <linux/xarray.h> 51#include <uapi/linux/fs.h> 52#include <uapi/linux/ublk_cmd.h> 53 54#define UBLK_MINORS (1U << MINORBITS) 55 56#define UBLK_INVALID_BUF_IDX ((u16)-1) 57 58/* private ioctl command mirror */ 59#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) 60#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) 61#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) 62#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV) 63#define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF) 64#define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF) 65 66/* Default max shmem buffer size: 4GB (may be increased in future) */ 67#define UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32) 68 69#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) 70#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) 71 72/* All UBLK_F_* have to be included into UBLK_F_ALL */ 73#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ 74 | UBLK_F_URING_CMD_COMP_IN_TASK \ 75 | UBLK_F_NEED_GET_DATA \ 76 | UBLK_F_USER_RECOVERY \ 77 | UBLK_F_USER_RECOVERY_REISSUE \ 78 | UBLK_F_UNPRIVILEGED_DEV \ 79 | UBLK_F_CMD_IOCTL_ENCODE \ 80 | UBLK_F_USER_COPY \ 81 | UBLK_F_ZONED \ 82 | UBLK_F_USER_RECOVERY_FAIL_IO \ 83 | UBLK_F_UPDATE_SIZE \ 84 | UBLK_F_AUTO_BUF_REG \ 85 | UBLK_F_QUIESCE \ 86 | UBLK_F_PER_IO_DAEMON \ 87 | UBLK_F_BUF_REG_OFF_DAEMON \ 88 | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? 
UBLK_F_INTEGRITY : 0) \ 89 | UBLK_F_SAFE_STOP_DEV \ 90 | UBLK_F_BATCH_IO \ 91 | UBLK_F_NO_AUTO_PART_SCAN \ 92 | UBLK_F_SHMEM_ZC) 93 94#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ 95 | UBLK_F_USER_RECOVERY_REISSUE \ 96 | UBLK_F_USER_RECOVERY_FAIL_IO) 97 98/* All UBLK_PARAM_TYPE_* should be included here */ 99#define UBLK_PARAM_TYPE_ALL \ 100 (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ 101 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ 102 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ 103 UBLK_PARAM_TYPE_INTEGRITY) 104 105#define UBLK_BATCH_F_ALL \ 106 (UBLK_BATCH_F_HAS_ZONE_LBA | \ 107 UBLK_BATCH_F_HAS_BUF_ADDR | \ 108 UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) 109 110/* ublk batch fetch uring_cmd */ 111struct ublk_batch_fetch_cmd { 112 struct list_head node; 113 struct io_uring_cmd *cmd; 114 unsigned short buf_group; 115}; 116 117struct ublk_uring_cmd_pdu { 118 /* 119 * Store requests in same batch temporarily for queuing them to 120 * daemon context. 121 * 122 * It should have been stored to request payload, but we do want 123 * to avoid extra pre-allocation, and uring_cmd payload is always 124 * free for us 125 */ 126 union { 127 struct request *req; 128 struct request *req_list; 129 }; 130 131 /* 132 * The following two are valid in this cmd whole lifetime, and 133 * setup in ublk uring_cmd handler 134 */ 135 struct ublk_queue *ubq; 136 137 union { 138 u16 tag; 139 struct ublk_batch_fetch_cmd *fcmd; /* batch io only */ 140 }; 141}; 142 143struct ublk_batch_io_data { 144 struct ublk_device *ub; 145 struct io_uring_cmd *cmd; 146 struct ublk_batch_io header; 147 unsigned int issue_flags; 148 struct io_comp_batch *iob; 149}; 150 151/* 152 * io command is active: sqe cmd is received, and its cqe isn't done 153 * 154 * If the flag is set, the io command is owned by ublk driver, and waited 155 * for incoming blk-mq request from the ublk block device. 156 * 157 * If the flag is cleared, the io command will be completed, and owned by 158 * ublk server. 159 */ 160#define UBLK_IO_FLAG_ACTIVE 0x01 161 162/* 163 * IO command is completed via cqe, and it is being handled by ublksrv, and 164 * not committed yet 165 * 166 * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for 167 * cross verification 168 */ 169#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02 170 171/* 172 * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires 173 * get data buffer address from ublksrv. 174 * 175 * Then, bio data could be copied into this data buffer for a WRITE request 176 * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset. 177 */ 178#define UBLK_IO_FLAG_NEED_GET_DATA 0x08 179 180/* 181 * request buffer is registered automatically, so we have to unregister it 182 * before completing this request. 183 * 184 * io_uring will unregister buffer automatically for us during exiting. 185 */ 186#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10 187 188/* atomic RW with ubq->cancel_lock */ 189#define UBLK_IO_FLAG_CANCELED 0x80000000 190 191/* 192 * Initialize refcount to a large number to include any registered buffers. 193 * UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for 194 * any buffers registered on the io daemon task. 
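 *
 * See the "ublk IO Reference Counting Design" comment further below for the
 * full reference lifecycle and worked examples.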
195 */ 196#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) 197 198/* used for UBLK_F_BATCH_IO only */ 199#define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1) 200 201union ublk_io_buf { 202 __u64 addr; 203 struct ublk_auto_buf_reg auto_reg; 204}; 205 206struct ublk_io { 207 union ublk_io_buf buf; 208 unsigned int flags; 209 int res; 210 211 union { 212 /* valid if UBLK_IO_FLAG_ACTIVE is set */ 213 struct io_uring_cmd *cmd; 214 /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */ 215 struct request *req; 216 }; 217 218 struct task_struct *task; 219 220 /* 221 * The number of uses of this I/O by the ublk server 222 * if user copy or zero copy are enabled: 223 * - UBLK_REFCOUNT_INIT from dispatch to the server 224 * until UBLK_IO_COMMIT_AND_FETCH_REQ 225 * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task 226 * - 1 for each io_uring registered buffer not registered on task 227 * The I/O can only be completed once all references are dropped. 228 * User copy and buffer registration operations are only permitted 229 * if the reference count is nonzero. 230 */ 231 refcount_t ref; 232 /* Count of buffers registered on task and not yet unregistered */ 233 unsigned task_registered_buffers; 234 235 void *buf_ctx_handle; 236 spinlock_t lock; 237} ____cacheline_aligned_in_smp; 238 239struct ublk_queue { 240 int q_id; 241 int q_depth; 242 243 unsigned long flags; 244 struct ublksrv_io_desc *io_cmd_buf; 245 246 bool force_abort; 247 bool canceling; 248 bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ 249 spinlock_t cancel_lock; 250 struct ublk_device *dev; 251 u32 nr_io_ready; 252 253 /* 254 * For supporting UBLK_F_BATCH_IO only. 255 * 256 * Inflight ublk request tag is saved in this fifo 257 * 258 * There are multiple writer from ublk_queue_rq() or ublk_queue_rqs(), 259 * so lock is required for storing request tag to fifo 260 * 261 * Make sure just one reader for fetching request from task work 262 * function to ublk server, so no need to grab the lock in reader 263 * side. 264 * 265 * Batch I/O State Management: 266 * 267 * The batch I/O system uses implicit state management based on the 268 * combination of three key variables below. 
269 * 270 * - IDLE: list_empty(&fcmd_head) && !active_fcmd 271 * No fetch commands available, events queue in evts_fifo 272 * 273 * - READY: !list_empty(&fcmd_head) && !active_fcmd 274 * Fetch commands available but none processing events 275 * 276 * - ACTIVE: active_fcmd 277 * One fetch command actively processing events from evts_fifo 278 * 279 * Key Invariants: 280 * - At most one active_fcmd at any time (single reader) 281 * - active_fcmd is always from fcmd_head list when non-NULL 282 * - evts_fifo can be read locklessly by the single active reader 283 * - All state transitions require evts_lock protection 284 * - Multiple writers to evts_fifo require lock protection 285 */ 286 struct { 287 DECLARE_KFIFO_PTR(evts_fifo, unsigned short); 288 spinlock_t evts_lock; 289 290 /* List of fetch commands available to process events */ 291 struct list_head fcmd_head; 292 293 /* Currently active fetch command (NULL = none active) */ 294 struct ublk_batch_fetch_cmd *active_fcmd; 295 }____cacheline_aligned_in_smp; 296 297 struct ublk_io ios[] __counted_by(q_depth); 298}; 299 300/* Maple tree value: maps a PFN range to buffer location */ 301struct ublk_buf_range { 302 unsigned short buf_index; 303 unsigned short flags; 304 unsigned int base_offset; /* byte offset within buffer */ 305}; 306 307struct ublk_device { 308 struct gendisk *ub_disk; 309 310 struct ublksrv_ctrl_dev_info dev_info; 311 312 struct blk_mq_tag_set tag_set; 313 314 struct cdev cdev; 315 struct device cdev_dev; 316 317#define UB_STATE_OPEN 0 318#define UB_STATE_USED 1 319#define UB_STATE_DELETED 2 320 unsigned long state; 321 int ub_number; 322 323 struct mutex mutex; 324 325 spinlock_t lock; 326 struct mm_struct *mm; 327 328 struct ublk_params params; 329 330 struct completion completion; 331 u32 nr_queue_ready; 332 bool unprivileged_daemons; 333 struct mutex cancel_mutex; 334 bool canceling; 335 pid_t ublksrv_tgid; 336 struct delayed_work exit_work; 337 struct work_struct partition_scan_work; 338 339 bool block_open; /* protected by open_mutex */ 340 341 /* shared memory zero copy */ 342 struct maple_tree buf_tree; 343 struct ida buf_ida; 344 345 struct ublk_queue *queues[]; 346}; 347 348/* header of ublk_params */ 349struct ublk_params_header { 350 __u32 len; 351 __u32 types; 352}; 353 354static void ublk_io_release(void *priv); 355static void ublk_stop_dev_unlocked(struct ublk_device *ub); 356static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq, 357 u32 *buf_idx, u32 *buf_off); 358static void ublk_buf_cleanup(struct ublk_device *ub); 359static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); 360static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 361 u16 q_id, u16 tag, struct ublk_io *io); 362static inline unsigned int ublk_req_build_flags(struct request *req); 363static void ublk_batch_dispatch(struct ublk_queue *ubq, 364 const struct ublk_batch_io_data *data, 365 struct ublk_batch_fetch_cmd *fcmd); 366 367static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) 368{ 369 return ub->dev_info.flags & UBLK_F_BATCH_IO; 370} 371 372static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) 373{ 374 return ubq->flags & UBLK_F_BATCH_IO; 375} 376 377static inline void ublk_io_lock(struct ublk_io *io) 378{ 379 spin_lock(&io->lock); 380} 381 382static inline void ublk_io_unlock(struct ublk_io *io) 383{ 384 spin_unlock(&io->lock); 385} 386 387/* Initialize the event queue */ 388static inline int ublk_io_evts_init(struct ublk_queue *q, 
unsigned int size, 389 int numa_node) 390{ 391 spin_lock_init(&q->evts_lock); 392 return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node); 393} 394 395/* Check if event queue is empty */ 396static inline bool ublk_io_evts_empty(const struct ublk_queue *q) 397{ 398 return kfifo_is_empty(&q->evts_fifo); 399} 400 401static inline void ublk_io_evts_deinit(struct ublk_queue *q) 402{ 403 WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo)); 404 kfifo_free(&q->evts_fifo); 405} 406 407static inline struct ublksrv_io_desc * 408ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) 409{ 410 return &ubq->io_cmd_buf[tag]; 411} 412 413static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) 414{ 415 return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; 416} 417 418static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) 419{ 420 return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; 421} 422 423static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq) 424{ 425 return ubq->flags & UBLK_F_SHMEM_ZC; 426} 427 428static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq, 429 unsigned int tag) 430{ 431 return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC; 432} 433 434static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub) 435{ 436 return ub->dev_info.flags & UBLK_F_SHMEM_ZC; 437} 438 439static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) 440{ 441 return ubq->flags & UBLK_F_AUTO_BUF_REG; 442} 443 444static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) 445{ 446 return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; 447} 448 449static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) 450{ 451 return ubq->flags & UBLK_F_USER_COPY; 452} 453 454static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) 455{ 456 return ub->dev_info.flags & UBLK_F_USER_COPY; 457} 458 459static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) 460{ 461 return ub->dev_info.flags & UBLK_F_ZONED; 462} 463 464static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) 465{ 466 return ubq->flags & UBLK_F_ZONED; 467} 468 469static inline bool ublk_dev_support_integrity(const struct ublk_device *ub) 470{ 471 return ub->dev_info.flags & UBLK_F_INTEGRITY; 472} 473 474#ifdef CONFIG_BLK_DEV_ZONED 475 476struct ublk_zoned_report_desc { 477 __u64 sector; 478 __u32 operation; 479 __u32 nr_zones; 480}; 481 482static DEFINE_XARRAY(ublk_zoned_report_descs); 483 484static int ublk_zoned_insert_report_desc(const struct request *req, 485 struct ublk_zoned_report_desc *desc) 486{ 487 return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, 488 desc, GFP_KERNEL); 489} 490 491static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( 492 const struct request *req) 493{ 494 return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); 495} 496 497static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( 498 const struct request *req) 499{ 500 return xa_load(&ublk_zoned_report_descs, (unsigned long)req); 501} 502 503static int ublk_get_nr_zones(const struct ublk_device *ub) 504{ 505 const struct ublk_param_basic *p = &ub->params.basic; 506 507 /* Zone size is a power of 2 */ 508 return p->dev_sectors >> ilog2(p->chunk_sectors); 509} 510 511static int ublk_revalidate_disk_zones(struct ublk_device *ub) 512{ 513 return blk_revalidate_disk_zones(ub->ub_disk); 514} 515 516static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) 517{ 518 const struct 
ublk_param_zoned *p = &ub->params.zoned; 519 int nr_zones; 520 521 if (!ublk_dev_is_zoned(ub)) 522 return -EINVAL; 523 524 if (!p->max_zone_append_sectors) 525 return -EINVAL; 526 527 nr_zones = ublk_get_nr_zones(ub); 528 529 if (p->max_active_zones > nr_zones) 530 return -EINVAL; 531 532 if (p->max_open_zones > nr_zones) 533 return -EINVAL; 534 535 return 0; 536} 537 538static void ublk_dev_param_zoned_apply(struct ublk_device *ub) 539{ 540 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); 541} 542 543/* Based on virtblk_alloc_report_buffer */ 544static void *ublk_alloc_report_buffer(struct ublk_device *ublk, 545 unsigned int nr_zones, size_t *buflen) 546{ 547 struct request_queue *q = ublk->ub_disk->queue; 548 size_t bufsize; 549 void *buf; 550 551 nr_zones = min_t(unsigned int, nr_zones, 552 ublk->ub_disk->nr_zones); 553 554 bufsize = nr_zones * sizeof(struct blk_zone); 555 bufsize = 556 min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); 557 558 while (bufsize >= sizeof(struct blk_zone)) { 559 buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); 560 if (buf) { 561 *buflen = bufsize; 562 return buf; 563 } 564 bufsize >>= 1; 565 } 566 567 *buflen = 0; 568 return NULL; 569} 570 571static int ublk_report_zones(struct gendisk *disk, sector_t sector, 572 unsigned int nr_zones, struct blk_report_zones_args *args) 573{ 574 struct ublk_device *ub = disk->private_data; 575 unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; 576 unsigned int first_zone = sector >> ilog2(zone_size_sectors); 577 unsigned int done_zones = 0; 578 unsigned int max_zones_per_request; 579 int ret; 580 struct blk_zone *buffer; 581 size_t buffer_length; 582 583 nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone, 584 nr_zones); 585 586 buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length); 587 if (!buffer) 588 return -ENOMEM; 589 590 max_zones_per_request = buffer_length / sizeof(struct blk_zone); 591 592 while (done_zones < nr_zones) { 593 unsigned int remaining_zones = nr_zones - done_zones; 594 unsigned int zones_in_request = 595 min_t(unsigned int, remaining_zones, max_zones_per_request); 596 struct request *req; 597 struct ublk_zoned_report_desc desc; 598 blk_status_t status; 599 600 memset(buffer, 0, buffer_length); 601 602 req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0); 603 if (IS_ERR(req)) { 604 ret = PTR_ERR(req); 605 goto out; 606 } 607 608 desc.operation = UBLK_IO_OP_REPORT_ZONES; 609 desc.sector = sector; 610 desc.nr_zones = zones_in_request; 611 ret = ublk_zoned_insert_report_desc(req, &desc); 612 if (ret) 613 goto free_req; 614 615 ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL); 616 if (ret) 617 goto erase_desc; 618 619 status = blk_execute_rq(req, 0); 620 ret = blk_status_to_errno(status); 621erase_desc: 622 ublk_zoned_erase_report_desc(req); 623free_req: 624 blk_mq_free_request(req); 625 if (ret) 626 goto out; 627 628 for (unsigned int i = 0; i < zones_in_request; i++) { 629 struct blk_zone *zone = buffer + i; 630 631 /* A zero length zone means no more zones in this response */ 632 if (!zone->len) 633 break; 634 635 ret = disk_report_zone(disk, zone, i, args); 636 if (ret) 637 goto out; 638 639 done_zones++; 640 sector += zone_size_sectors; 641 642 } 643 } 644 645 ret = done_zones; 646 647out: 648 kvfree(buffer); 649 return ret; 650} 651 652static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, 653 struct request *req) 654{ 655 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 656 struct ublk_io *io = 
&ubq->ios[req->tag]; 657 struct ublk_zoned_report_desc *desc; 658 u32 ublk_op; 659 660 switch (req_op(req)) { 661 case REQ_OP_ZONE_OPEN: 662 ublk_op = UBLK_IO_OP_ZONE_OPEN; 663 break; 664 case REQ_OP_ZONE_CLOSE: 665 ublk_op = UBLK_IO_OP_ZONE_CLOSE; 666 break; 667 case REQ_OP_ZONE_FINISH: 668 ublk_op = UBLK_IO_OP_ZONE_FINISH; 669 break; 670 case REQ_OP_ZONE_RESET: 671 ublk_op = UBLK_IO_OP_ZONE_RESET; 672 break; 673 case REQ_OP_ZONE_APPEND: 674 ublk_op = UBLK_IO_OP_ZONE_APPEND; 675 break; 676 case REQ_OP_ZONE_RESET_ALL: 677 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; 678 break; 679 case REQ_OP_DRV_IN: 680 desc = ublk_zoned_get_report_desc(req); 681 if (!desc) 682 return BLK_STS_IOERR; 683 ublk_op = desc->operation; 684 switch (ublk_op) { 685 case UBLK_IO_OP_REPORT_ZONES: 686 iod->op_flags = ublk_op | ublk_req_build_flags(req); 687 iod->nr_zones = desc->nr_zones; 688 iod->start_sector = desc->sector; 689 return BLK_STS_OK; 690 default: 691 return BLK_STS_IOERR; 692 } 693 case REQ_OP_DRV_OUT: 694 /* We do not support drv_out */ 695 return BLK_STS_NOTSUPP; 696 default: 697 return BLK_STS_IOERR; 698 } 699 700 iod->op_flags = ublk_op | ublk_req_build_flags(req); 701 iod->nr_sectors = blk_rq_sectors(req); 702 iod->start_sector = blk_rq_pos(req); 703 iod->addr = io->buf.addr; 704 705 return BLK_STS_OK; 706} 707 708#else 709 710#define ublk_report_zones (NULL) 711 712static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) 713{ 714 return -EOPNOTSUPP; 715} 716 717static void ublk_dev_param_zoned_apply(struct ublk_device *ub) 718{ 719} 720 721static int ublk_revalidate_disk_zones(struct ublk_device *ub) 722{ 723 return 0; 724} 725 726static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, 727 struct request *req) 728{ 729 return BLK_STS_NOTSUPP; 730} 731 732#endif 733 734static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, 735 bool need_map, struct io_comp_batch *iob); 736 737static dev_t ublk_chr_devt; 738static const struct class ublk_chr_class = { 739 .name = "ublk-char", 740}; 741 742static DEFINE_IDR(ublk_index_idr); 743static DEFINE_SPINLOCK(ublk_idr_lock); 744static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ 745 746static DEFINE_MUTEX(ublk_ctl_mutex); 747 748static struct ublk_batch_fetch_cmd * 749ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd) 750{ 751 struct ublk_batch_fetch_cmd *fcmd = kzalloc_obj(*fcmd, GFP_NOIO); 752 753 if (fcmd) { 754 fcmd->cmd = cmd; 755 fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index); 756 } 757 return fcmd; 758} 759 760static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd) 761{ 762 kfree(fcmd); 763} 764 765static void __ublk_release_fcmd(struct ublk_queue *ubq) 766{ 767 WRITE_ONCE(ubq->active_fcmd, NULL); 768} 769 770/* 771 * Nothing can move on, so clear ->active_fcmd, and the caller should stop 772 * dispatching 773 */ 774static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq, 775 const struct ublk_batch_io_data *data, 776 struct ublk_batch_fetch_cmd *fcmd, 777 int res) 778{ 779 spin_lock(&ubq->evts_lock); 780 list_del_init(&fcmd->node); 781 WARN_ON_ONCE(fcmd != ubq->active_fcmd); 782 __ublk_release_fcmd(ubq); 783 spin_unlock(&ubq->evts_lock); 784 785 io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); 786 ublk_batch_free_fcmd(fcmd); 787} 788 789static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd, 790 struct io_br_sel *sel, 791 unsigned int issue_flags) 792{ 793 if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags)) 794 return -ENOBUFS; 795 return 0; 
796} 797 798static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd, 799 void __user *buf, const u16 *tag_buf, 800 unsigned int len) 801{ 802 if (copy_to_user(buf, tag_buf, len)) 803 return -EFAULT; 804 return len; 805} 806 807#define UBLK_MAX_UBLKS UBLK_MINORS 808 809/* 810 * Max unprivileged ublk devices allowed to add 811 * 812 * It can be extended to one per-user limit in future or even controlled 813 * by cgroup. 814 */ 815static unsigned int unprivileged_ublks_max = 64; 816static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */ 817 818static struct miscdevice ublk_misc; 819 820static inline unsigned ublk_pos_to_hwq(loff_t pos) 821{ 822 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) & 823 UBLK_QID_BITS_MASK; 824} 825 826static inline unsigned ublk_pos_to_buf_off(loff_t pos) 827{ 828 return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK; 829} 830 831static inline unsigned ublk_pos_to_tag(loff_t pos) 832{ 833 return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) & 834 UBLK_TAG_BITS_MASK; 835} 836 837static void ublk_dev_param_basic_apply(struct ublk_device *ub) 838{ 839 const struct ublk_param_basic *p = &ub->params.basic; 840 841 if (p->attrs & UBLK_ATTR_READ_ONLY) 842 set_disk_ro(ub->ub_disk, true); 843 844 set_capacity(ub->ub_disk, p->dev_sectors); 845} 846 847static int ublk_integrity_flags(u32 flags) 848{ 849 int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE; 850 851 if (flags & LBMD_PI_CAP_INTEGRITY) { 852 flags &= ~LBMD_PI_CAP_INTEGRITY; 853 ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE; 854 } 855 if (flags & LBMD_PI_CAP_REFTAG) { 856 flags &= ~LBMD_PI_CAP_REFTAG; 857 ret_flags |= BLK_INTEGRITY_REF_TAG; 858 } 859 return flags ? -EINVAL : ret_flags; 860} 861 862static int ublk_integrity_pi_tuple_size(u8 csum_type) 863{ 864 switch (csum_type) { 865 case LBMD_PI_CSUM_NONE: 866 return 0; 867 case LBMD_PI_CSUM_IP: 868 case LBMD_PI_CSUM_CRC16_T10DIF: 869 return 8; 870 case LBMD_PI_CSUM_CRC64_NVME: 871 return 16; 872 default: 873 return -EINVAL; 874 } 875} 876 877static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type) 878{ 879 switch (csum_type) { 880 case LBMD_PI_CSUM_NONE: 881 return BLK_INTEGRITY_CSUM_NONE; 882 case LBMD_PI_CSUM_IP: 883 return BLK_INTEGRITY_CSUM_IP; 884 case LBMD_PI_CSUM_CRC16_T10DIF: 885 return BLK_INTEGRITY_CSUM_CRC; 886 case LBMD_PI_CSUM_CRC64_NVME: 887 return BLK_INTEGRITY_CSUM_CRC64; 888 default: 889 WARN_ON_ONCE(1); 890 return BLK_INTEGRITY_CSUM_NONE; 891 } 892} 893 894static int ublk_validate_params(const struct ublk_device *ub) 895{ 896 /* basic param is the only one which must be set */ 897 if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { 898 const struct ublk_param_basic *p = &ub->params.basic; 899 900 if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) 901 return -EINVAL; 902 903 if (p->logical_bs_shift > p->physical_bs_shift) 904 return -EINVAL; 905 906 if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) 907 return -EINVAL; 908 909 if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) 910 return -EINVAL; 911 } else 912 return -EINVAL; 913 914 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 915 const struct ublk_param_discard *p = &ub->params.discard; 916 917 /* So far, only support single segment discard */ 918 if (p->max_discard_sectors && p->max_discard_segments != 1) 919 return -EINVAL; 920 921 if (!p->discard_granularity) 922 return -EINVAL; 923 } 924 925 /* dev_t is read-only */ 926 if (ub->params.types & UBLK_PARAM_TYPE_DEVT) 927 return -EINVAL; 928 929 if 
(ub->params.types & UBLK_PARAM_TYPE_ZONED) 930 return ublk_dev_param_zoned_validate(ub); 931 else if (ublk_dev_is_zoned(ub)) 932 return -EINVAL; 933 934 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) { 935 const struct ublk_param_dma_align *p = &ub->params.dma; 936 937 if (p->alignment >= PAGE_SIZE) 938 return -EINVAL; 939 940 if (!is_power_of_2(p->alignment + 1)) 941 return -EINVAL; 942 } 943 944 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { 945 const struct ublk_param_segment *p = &ub->params.seg; 946 947 if (!is_power_of_2(p->seg_boundary_mask + 1)) 948 return -EINVAL; 949 950 if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE) 951 return -EINVAL; 952 if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE) 953 return -EINVAL; 954 } 955 956 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { 957 const struct ublk_param_integrity *p = &ub->params.integrity; 958 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); 959 int flags = ublk_integrity_flags(p->flags); 960 961 if (!ublk_dev_support_integrity(ub)) 962 return -EINVAL; 963 if (flags < 0) 964 return flags; 965 if (pi_tuple_size < 0) 966 return pi_tuple_size; 967 if (!p->metadata_size) 968 return -EINVAL; 969 if (p->csum_type == LBMD_PI_CSUM_NONE && 970 p->flags & LBMD_PI_CAP_REFTAG) 971 return -EINVAL; 972 if (p->pi_offset + pi_tuple_size > p->metadata_size) 973 return -EINVAL; 974 if (p->interval_exp < SECTOR_SHIFT || 975 p->interval_exp > ub->params.basic.logical_bs_shift) 976 return -EINVAL; 977 } 978 979 return 0; 980} 981 982static void ublk_apply_params(struct ublk_device *ub) 983{ 984 ublk_dev_param_basic_apply(ub); 985 986 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) 987 ublk_dev_param_zoned_apply(ub); 988} 989 990static inline bool ublk_need_map_io(const struct ublk_queue *ubq) 991{ 992 return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && 993 !ublk_support_auto_buf_reg(ubq); 994} 995 996static inline bool ublk_dev_need_map_io(const struct ublk_device *ub) 997{ 998 return !ublk_dev_support_user_copy(ub) && 999 !ublk_dev_support_zero_copy(ub) && 1000 !ublk_dev_support_auto_buf_reg(ub); 1001} 1002 1003static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) 1004{ 1005 /* 1006 * read()/write() is involved in user copy, so request reference 1007 * has to be grabbed 1008 * 1009 * for zero copy, request buffer need to be registered to io_uring 1010 * buffer table, so reference is needed 1011 * 1012 * For auto buffer register, ublk server still may issue 1013 * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up, 1014 * so reference is required too. 
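 *
 * For user copy the reference is held across each ublk_ch_read_iter() /
 * ublk_ch_write_iter() call; for (auto) buffer registration it is held until
 * the buffer is unregistered again via ublk_io_release().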
1015 */ 1016 return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) || 1017 ublk_support_auto_buf_reg(ubq); 1018} 1019 1020static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub) 1021{ 1022 return ublk_dev_support_user_copy(ub) || 1023 ublk_dev_support_zero_copy(ub) || 1024 ublk_dev_support_auto_buf_reg(ub); 1025} 1026 1027/* 1028 * ublk IO Reference Counting Design 1029 * ================================== 1030 * 1031 * For user-copy and zero-copy modes, ublk uses a split reference model with 1032 * two counters that together track IO lifetime: 1033 * 1034 * - io->ref: refcount for off-task buffer registrations and user-copy ops 1035 * - io->task_registered_buffers: count of buffers registered on the IO task 1036 * 1037 * Key Invariant: 1038 * -------------- 1039 * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set), 1040 * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT 1041 * when no active references exist. After IO completion, both counters become 1042 * zero. For I/Os not currently dispatched to the ublk server, both ref and 1043 * task_registered_buffers are 0. 1044 * 1045 * This invariant is checked by ublk_check_and_reset_active_ref() during daemon 1046 * exit to determine if all references have been released. 1047 * 1048 * Why Split Counters: 1049 * ------------------- 1050 * Buffers registered on the IO daemon task can use the lightweight 1051 * task_registered_buffers counter (simple increment/decrement) instead of 1052 * atomic refcount operations. The ublk_io_release() callback checks if 1053 * current == io->task to decide which counter to update. 1054 * 1055 * This optimization only applies before IO completion. At completion, 1056 * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref. 1057 * After that, all subsequent buffer unregistrations must use the atomic ref 1058 * since they may be releasing the last reference. 1059 * 1060 * Reference Lifecycle: 1061 * -------------------- 1062 * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch 1063 * 1064 * 2. During IO processing: 1065 * - On-task buffer reg: task_registered_buffers++ (no ref change) 1066 * - Off-task buffer reg: ref++ via ublk_get_req_ref() 1067 * - Buffer unregister callback (ublk_io_release): 1068 * * If on-task: task_registered_buffers-- 1069 * * If off-task: ref-- via ublk_put_req_ref() 1070 * 1071 * 3. ublk_sub_req_ref() at IO completion: 1072 * - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers 1073 * - Subtracts sub_refs from ref and zeroes task_registered_buffers 1074 * - This effectively collapses task_registered_buffers into the atomic ref, 1075 * accounting for the initial UBLK_REFCOUNT_INIT minus any on-task 1076 * buffers that were already counted 1077 * 1078 * Example (zero-copy, register on-task, unregister off-task): 1079 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0 1080 * - Register buffer on-task: task_registered_buffers = 1 1081 * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1 1082 * - Completion via ublk_sub_req_ref(): 1083 * sub_refs = UBLK_REFCOUNT_INIT - 1, 1084 * ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0 1085 * 1086 * Example (auto buffer registration): 1087 * Auto buffer registration sets task_registered_buffers = 1 at dispatch. 
1088 * 1089 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1 1090 * - Buffer unregister: task_registered_buffers-- (becomes 0) 1091 * - Completion via ublk_sub_req_ref(): 1092 * sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0 1093 * 1094 * Example (zero-copy, ublk server killed): 1095 * When daemon is killed, io_uring cleanup unregisters buffers off-task. 1096 * ublk_check_and_reset_active_ref() waits for the invariant to hold. 1097 * 1098 * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0 1099 * - Register buffer on-task: task_registered_buffers = 1 1100 * - Daemon killed, io_uring cleanup unregisters buffer (off-task): 1101 * ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1 1102 * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT 1103 * - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by 1104 * ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed 1105 * and abort pending requests 1106 * 1107 * Batch IO Special Case: 1108 * ---------------------- 1109 * In batch IO mode, io->task is NULL. This means ublk_io_release() always 1110 * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The 1111 * task_registered_buffers counter still tracks registered buffers for the 1112 * invariant check, even though the callback doesn't decrement it. 1113 * 1114 * Note: updating task_registered_buffers is protected by io->lock. 1115 */ 1116static inline void ublk_init_req_ref(const struct ublk_queue *ubq, 1117 struct ublk_io *io) 1118{ 1119 if (ublk_need_req_ref(ubq)) 1120 refcount_set(&io->ref, UBLK_REFCOUNT_INIT); 1121} 1122 1123static inline bool ublk_get_req_ref(struct ublk_io *io) 1124{ 1125 return refcount_inc_not_zero(&io->ref); 1126} 1127 1128static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) 1129{ 1130 if (!refcount_dec_and_test(&io->ref)) 1131 return; 1132 1133 /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */ 1134 __ublk_complete_rq(req, io, false, NULL); 1135} 1136 1137static inline bool ublk_sub_req_ref(struct ublk_io *io) 1138{ 1139 unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers; 1140 1141 io->task_registered_buffers = 0; 1142 return refcount_sub_and_test(sub_refs, &io->ref); 1143} 1144 1145static inline bool ublk_need_get_data(const struct ublk_queue *ubq) 1146{ 1147 return ubq->flags & UBLK_F_NEED_GET_DATA; 1148} 1149 1150static inline bool ublk_dev_need_get_data(const struct ublk_device *ub) 1151{ 1152 return ub->dev_info.flags & UBLK_F_NEED_GET_DATA; 1153} 1154 1155/* Called in slow path only, keep it noinline for trace purpose */ 1156static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) 1157{ 1158 if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) 1159 return ub; 1160 return NULL; 1161} 1162 1163/* Called in slow path only, keep it noinline for trace purpose */ 1164static noinline void ublk_put_device(struct ublk_device *ub) 1165{ 1166 put_device(&ub->cdev_dev); 1167} 1168 1169static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, 1170 int qid) 1171{ 1172 return dev->queues[qid]; 1173} 1174 1175static inline bool ublk_rq_has_data(const struct request *rq) 1176{ 1177 return bio_has_data(rq->bio); 1178} 1179 1180static inline struct ublksrv_io_desc * 1181ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) 1182{ 1183 return ublk_get_queue(ub, q_id)->io_cmd_buf; 1184} 1185 1186static inline int __ublk_queue_cmd_buf_size(int depth) 1187{ 1188 return 
round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); 1189} 1190 1191static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub) 1192{ 1193 return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth); 1194} 1195 1196static int ublk_max_cmd_buf_size(void) 1197{ 1198 return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH); 1199} 1200 1201/* 1202 * Should I/O outstanding to the ublk server when it exits be reissued? 1203 * If not, outstanding I/O will get errors. 1204 */ 1205static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub) 1206{ 1207 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && 1208 (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE); 1209} 1210 1211/* 1212 * Should I/O issued while there is no ublk server queue? If not, I/O 1213 * issued while there is no ublk server will get errors. 1214 */ 1215static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub) 1216{ 1217 return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) && 1218 !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO); 1219} 1220 1221/* 1222 * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy 1223 * of the device flags for smaller cache footprint - better for fast 1224 * paths. 1225 */ 1226static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq) 1227{ 1228 return (ubq->flags & UBLK_F_USER_RECOVERY) && 1229 !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO); 1230} 1231 1232/* 1233 * Should ublk devices be stopped (i.e. no recovery possible) when the 1234 * ublk server exits? If not, devices can be used again by a future 1235 * incarnation of a ublk server via the start_recovery/end_recovery 1236 * commands. 1237 */ 1238static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub) 1239{ 1240 return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY); 1241} 1242 1243static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub) 1244{ 1245 return ub->dev_info.state == UBLK_S_DEV_QUIESCED || 1246 ub->dev_info.state == UBLK_S_DEV_FAIL_IO; 1247} 1248 1249static void ublk_free_disk(struct gendisk *disk) 1250{ 1251 struct ublk_device *ub = disk->private_data; 1252 1253 clear_bit(UB_STATE_USED, &ub->state); 1254 ublk_put_device(ub); 1255} 1256 1257static void ublk_store_owner_uid_gid(unsigned int *owner_uid, 1258 unsigned int *owner_gid) 1259{ 1260 kuid_t uid; 1261 kgid_t gid; 1262 1263 current_uid_gid(&uid, &gid); 1264 1265 *owner_uid = from_kuid(&init_user_ns, uid); 1266 *owner_gid = from_kgid(&init_user_ns, gid); 1267} 1268 1269static int ublk_open(struct gendisk *disk, blk_mode_t mode) 1270{ 1271 struct ublk_device *ub = disk->private_data; 1272 1273 if (capable(CAP_SYS_ADMIN)) 1274 return 0; 1275 1276 /* 1277 * If it is one unprivileged device, only owner can open 1278 * the disk. Otherwise it could be one trap made by one 1279 * evil user who grants this disk's privileges to other 1280 * users deliberately. 1281 * 1282 * This way is reasonable too given anyone can create 1283 * unprivileged device, and no need other's grant. 
1284 */ 1285 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { 1286 unsigned int curr_uid, curr_gid; 1287 1288 ublk_store_owner_uid_gid(&curr_uid, &curr_gid); 1289 1290 if (curr_uid != ub->dev_info.owner_uid || curr_gid != 1291 ub->dev_info.owner_gid) 1292 return -EPERM; 1293 } 1294 1295 if (ub->block_open) 1296 return -ENXIO; 1297 1298 return 0; 1299} 1300 1301static const struct block_device_operations ub_fops = { 1302 .owner = THIS_MODULE, 1303 .open = ublk_open, 1304 .free_disk = ublk_free_disk, 1305 .report_zones = ublk_report_zones, 1306}; 1307 1308static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset, 1309 struct iov_iter *uiter, int dir, size_t *done) 1310{ 1311 unsigned len; 1312 void *bv_buf; 1313 size_t copied; 1314 1315 if (*offset >= bv->bv_len) { 1316 *offset -= bv->bv_len; 1317 return true; 1318 } 1319 1320 len = bv->bv_len - *offset; 1321 bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset; 1322 /* 1323 * Bio pages may originate from slab caches without a usercopy region 1324 * (e.g. jbd2 frozen metadata buffers). This is the same data that 1325 * the loop driver writes to its backing file — no exposure risk. 1326 * The bvec length is always trusted, so the size check in 1327 * check_copy_size() is not needed either. Use the unchecked 1328 * helpers to avoid false positives on slab pages. 1329 */ 1330 if (dir == ITER_DEST) 1331 copied = _copy_to_iter(bv_buf, len, uiter); 1332 else 1333 copied = _copy_from_iter(bv_buf, len, uiter); 1334 1335 kunmap_local(bv_buf); 1336 1337 *done += copied; 1338 if (copied < len) 1339 return false; 1340 1341 *offset = 0; 1342 return true; 1343} 1344 1345/* 1346 * Copy data between request pages and io_iter, and 'offset' 1347 * is the start point of linear offset of request. 
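 *
 * 'dir' is relative to 'uiter': ITER_DEST copies request pages into the user
 * iterator (e.g. handing WRITE payload to the ublk server), while ITER_SOURCE
 * copies from the user iterator into the request pages (e.g. filling READ
 * data back in at completion).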
1348 */ 1349static size_t ublk_copy_user_pages(const struct request *req, 1350 unsigned offset, struct iov_iter *uiter, int dir) 1351{ 1352 struct req_iterator iter; 1353 struct bio_vec bv; 1354 size_t done = 0; 1355 1356 rq_for_each_segment(bv, req, iter) { 1357 if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done)) 1358 break; 1359 } 1360 return done; 1361} 1362 1363#ifdef CONFIG_BLK_DEV_INTEGRITY 1364static size_t ublk_copy_user_integrity(const struct request *req, 1365 unsigned offset, struct iov_iter *uiter, int dir) 1366{ 1367 size_t done = 0; 1368 struct bio *bio = req->bio; 1369 struct bvec_iter iter; 1370 struct bio_vec iv; 1371 1372 if (!blk_integrity_rq(req)) 1373 return 0; 1374 1375 bio_for_each_integrity_vec(iv, bio, iter) { 1376 if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done)) 1377 break; 1378 } 1379 1380 return done; 1381} 1382#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ 1383static size_t ublk_copy_user_integrity(const struct request *req, 1384 unsigned offset, struct iov_iter *uiter, int dir) 1385{ 1386 return 0; 1387} 1388#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ 1389 1390static inline bool ublk_need_map_req(const struct request *req) 1391{ 1392 return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; 1393} 1394 1395static inline bool ublk_need_unmap_req(const struct request *req) 1396{ 1397 return ublk_rq_has_data(req) && 1398 (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); 1399} 1400 1401static unsigned int ublk_map_io(const struct ublk_queue *ubq, 1402 const struct request *req, 1403 const struct ublk_io *io) 1404{ 1405 const unsigned int rq_bytes = blk_rq_bytes(req); 1406 1407 if (!ublk_need_map_io(ubq)) 1408 return rq_bytes; 1409 1410 /* 1411 * no zero copy, we delay copy WRITE request data into ublksrv 1412 * context and the big benefit is that pinning pages in current 1413 * context is pretty fast, see ublk_pin_user_pages 1414 */ 1415 if (ublk_need_map_req(req)) { 1416 struct iov_iter iter; 1417 const int dir = ITER_DEST; 1418 1419 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter); 1420 return ublk_copy_user_pages(req, 0, &iter, dir); 1421 } 1422 return rq_bytes; 1423} 1424 1425static unsigned int ublk_unmap_io(bool need_map, 1426 const struct request *req, 1427 const struct ublk_io *io) 1428{ 1429 const unsigned int rq_bytes = blk_rq_bytes(req); 1430 1431 if (!need_map) 1432 return rq_bytes; 1433 1434 if (ublk_need_unmap_req(req)) { 1435 struct iov_iter iter; 1436 const int dir = ITER_SOURCE; 1437 1438 WARN_ON_ONCE(io->res > rq_bytes); 1439 1440 import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter); 1441 return ublk_copy_user_pages(req, 0, &iter, dir); 1442 } 1443 return rq_bytes; 1444} 1445 1446static inline unsigned int ublk_req_build_flags(struct request *req) 1447{ 1448 unsigned flags = 0; 1449 1450 if (req->cmd_flags & REQ_FAILFAST_DEV) 1451 flags |= UBLK_IO_F_FAILFAST_DEV; 1452 1453 if (req->cmd_flags & REQ_FAILFAST_TRANSPORT) 1454 flags |= UBLK_IO_F_FAILFAST_TRANSPORT; 1455 1456 if (req->cmd_flags & REQ_FAILFAST_DRIVER) 1457 flags |= UBLK_IO_F_FAILFAST_DRIVER; 1458 1459 if (req->cmd_flags & REQ_META) 1460 flags |= UBLK_IO_F_META; 1461 1462 if (req->cmd_flags & REQ_FUA) 1463 flags |= UBLK_IO_F_FUA; 1464 1465 if (req->cmd_flags & REQ_NOUNMAP) 1466 flags |= UBLK_IO_F_NOUNMAP; 1467 1468 if (req->cmd_flags & REQ_SWAP) 1469 flags |= UBLK_IO_F_SWAP; 1470 1471 if (blk_integrity_rq(req)) 1472 flags |= UBLK_IO_F_INTEGRITY; 1473 1474 return flags; 1475} 1476 1477static blk_status_t ublk_setup_iod(struct 
ublk_queue *ubq, struct request *req) 1478{ 1479 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 1480 struct ublk_io *io = &ubq->ios[req->tag]; 1481 u32 ublk_op; 1482 1483 switch (req_op(req)) { 1484 case REQ_OP_READ: 1485 ublk_op = UBLK_IO_OP_READ; 1486 break; 1487 case REQ_OP_WRITE: 1488 ublk_op = UBLK_IO_OP_WRITE; 1489 break; 1490 case REQ_OP_FLUSH: 1491 ublk_op = UBLK_IO_OP_FLUSH; 1492 break; 1493 case REQ_OP_DISCARD: 1494 ublk_op = UBLK_IO_OP_DISCARD; 1495 break; 1496 case REQ_OP_WRITE_ZEROES: 1497 ublk_op = UBLK_IO_OP_WRITE_ZEROES; 1498 break; 1499 default: 1500 if (ublk_queue_is_zoned(ubq)) 1501 return ublk_setup_iod_zoned(ubq, req); 1502 return BLK_STS_IOERR; 1503 } 1504 1505 /* need to translate since kernel may change */ 1506 iod->op_flags = ublk_op | ublk_req_build_flags(req); 1507 iod->nr_sectors = blk_rq_sectors(req); 1508 iod->start_sector = blk_rq_pos(req); 1509 1510 /* Try shmem zero-copy match before setting addr */ 1511 if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) { 1512 u32 buf_idx, buf_off; 1513 1514 if (ublk_try_buf_match(ubq->dev, req, 1515 &buf_idx, &buf_off)) { 1516 iod->op_flags |= UBLK_IO_F_SHMEM_ZC; 1517 iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off); 1518 return BLK_STS_OK; 1519 } 1520 } 1521 1522 iod->addr = io->buf.addr; 1523 1524 return BLK_STS_OK; 1525} 1526 1527static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( 1528 struct io_uring_cmd *ioucmd) 1529{ 1530 return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); 1531} 1532 1533static void ublk_end_request(struct request *req, blk_status_t error) 1534{ 1535 local_bh_disable(); 1536 blk_mq_end_request(req, error); 1537 local_bh_enable(); 1538} 1539 1540/* todo: handle partial completion */ 1541static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, 1542 bool need_map, struct io_comp_batch *iob) 1543{ 1544 unsigned int unmapped_bytes; 1545 blk_status_t res = BLK_STS_OK; 1546 bool requeue; 1547 1548 /* failed read IO if nothing is read */ 1549 if (!io->res && req_op(req) == REQ_OP_READ) 1550 io->res = -EIO; 1551 1552 if (io->res < 0) { 1553 res = errno_to_blk_status(io->res); 1554 goto exit; 1555 } 1556 1557 /* 1558 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them 1559 * directly. 1560 * 1561 * Both the two needn't unmap. 1562 */ 1563 if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE && 1564 req_op(req) != REQ_OP_DRV_IN) 1565 goto exit; 1566 1567 /* shmem zero copy: no data to unmap, pages already shared */ 1568 if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag)) 1569 goto exit; 1570 1571 /* for READ request, writing data in iod->addr to rq buffers */ 1572 unmapped_bytes = ublk_unmap_io(need_map, req, io); 1573 1574 /* 1575 * Extremely impossible since we got data filled in just before 1576 * 1577 * Re-read simply for this unlikely case. 1578 */ 1579 if (unlikely(unmapped_bytes < io->res)) 1580 io->res = unmapped_bytes; 1581 1582 /* 1583 * Run bio->bi_end_io() with softirqs disabled. If the final fput 1584 * happens off this path, then that will prevent ublk's blkdev_release() 1585 * from being called on current's task work, see fput() implementation. 1586 * 1587 * Otherwise, ublk server may not provide forward progress in case of 1588 * reading the partition table from bdev_open() with disk->open_mutex 1589 * held, and causes dead lock as we could already be holding 1590 * disk->open_mutex here. 
1591 * 1592 * Preferably we would not be doing IO with a mutex held that is also 1593 * used for release, but this work-around will suffice for now. 1594 */ 1595 local_bh_disable(); 1596 requeue = blk_update_request(req, BLK_STS_OK, io->res); 1597 local_bh_enable(); 1598 if (requeue) 1599 blk_mq_requeue_request(req, true); 1600 else if (likely(!blk_should_fake_timeout(req->q))) { 1601 if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch)) 1602 return; 1603 __blk_mq_end_request(req, BLK_STS_OK); 1604 } 1605 1606 return; 1607exit: 1608 ublk_end_request(req, res); 1609} 1610 1611static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, 1612 struct request *req) 1613{ 1614 /* read cmd first because req will overwrite it */ 1615 struct io_uring_cmd *cmd = io->cmd; 1616 1617 /* mark this cmd owned by ublksrv */ 1618 io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; 1619 1620 /* 1621 * clear ACTIVE since we are done with this sqe/cmd slot 1622 * We can only accept io cmd in case of being not active. 1623 */ 1624 io->flags &= ~UBLK_IO_FLAG_ACTIVE; 1625 1626 io->req = req; 1627 return cmd; 1628} 1629 1630static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, 1631 int res, unsigned issue_flags) 1632{ 1633 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); 1634 1635 /* tell ublksrv one io request is coming */ 1636 io_uring_cmd_done(cmd, res, issue_flags); 1637} 1638 1639#define UBLK_REQUEUE_DELAY_MS 3 1640 1641static inline void __ublk_abort_rq(struct ublk_queue *ubq, 1642 struct request *rq) 1643{ 1644 /* We cannot process this rq so just requeue it. */ 1645 if (ublk_nosrv_dev_should_queue_io(ubq->dev)) 1646 blk_mq_requeue_request(rq, false); 1647 else 1648 ublk_end_request(rq, BLK_STS_IOERR); 1649} 1650 1651static void 1652ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag) 1653{ 1654 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag); 1655 1656 iod->op_flags |= UBLK_IO_F_NEED_REG_BUF; 1657} 1658 1659enum auto_buf_reg_res { 1660 AUTO_BUF_REG_FAIL, 1661 AUTO_BUF_REG_FALLBACK, 1662 AUTO_BUF_REG_OK, 1663}; 1664 1665/* 1666 * Setup io state after auto buffer registration. 1667 * 1668 * Must be called after ublk_auto_buf_register() is done. 1669 * Caller must hold io->lock in batch context. 1670 */ 1671static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq, 1672 struct request *req, struct ublk_io *io, 1673 struct io_uring_cmd *cmd, 1674 enum auto_buf_reg_res res) 1675{ 1676 if (res == AUTO_BUF_REG_OK) { 1677 io->task_registered_buffers = 1; 1678 io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); 1679 io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; 1680 } 1681 ublk_init_req_ref(ubq, io); 1682 __ublk_prep_compl_io_cmd(io, req); 1683} 1684 1685/* Register request bvec to io_uring for auto buffer registration. */ 1686static enum auto_buf_reg_res 1687ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req, 1688 struct ublk_io *io, struct io_uring_cmd *cmd, 1689 unsigned int issue_flags) 1690{ 1691 int ret; 1692 1693 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, 1694 io->buf.auto_reg.index, issue_flags); 1695 if (ret) { 1696 if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { 1697 ublk_auto_buf_reg_fallback(ubq, req->tag); 1698 return AUTO_BUF_REG_FALLBACK; 1699 } 1700 ublk_end_request(req, BLK_STS_IOERR); 1701 return AUTO_BUF_REG_FAIL; 1702 } 1703 1704 return AUTO_BUF_REG_OK; 1705} 1706 1707/* 1708 * Dispatch IO to userspace with auto buffer registration. 
1709 * 1710 * Only called in non-batch context from task work, io->lock not held. 1711 */ 1712static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq, 1713 struct request *req, struct ublk_io *io, 1714 struct io_uring_cmd *cmd, 1715 unsigned int issue_flags) 1716{ 1717 enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd, 1718 issue_flags); 1719 1720 if (res != AUTO_BUF_REG_FAIL) { 1721 ublk_auto_buf_io_setup(ubq, req, io, cmd, res); 1722 io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags); 1723 } 1724} 1725 1726static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, 1727 struct ublk_io *io) 1728{ 1729 unsigned mapped_bytes; 1730 1731 /* shmem zero copy: skip data copy, pages already shared */ 1732 if (ublk_iod_is_shmem_zc(ubq, req->tag)) 1733 return true; 1734 1735 mapped_bytes = ublk_map_io(ubq, req, io); 1736 1737 /* partially mapped, update io descriptor */ 1738 if (unlikely(mapped_bytes != blk_rq_bytes(req))) { 1739 /* 1740 * Nothing mapped, retry until we succeed. 1741 * 1742 * We may never succeed in mapping any bytes here because 1743 * of OOM. TODO: reserve one buffer with single page pinned 1744 * for providing forward progress guarantee. 1745 */ 1746 if (unlikely(!mapped_bytes)) { 1747 blk_mq_requeue_request(req, false); 1748 blk_mq_delay_kick_requeue_list(req->q, 1749 UBLK_REQUEUE_DELAY_MS); 1750 return false; 1751 } 1752 1753 ublk_get_iod(ubq, req->tag)->nr_sectors = 1754 mapped_bytes >> 9; 1755 } 1756 1757 return true; 1758} 1759 1760static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) 1761{ 1762 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 1763 int tag = req->tag; 1764 struct ublk_io *io = &ubq->ios[tag]; 1765 1766 pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n", 1767 __func__, ubq->q_id, req->tag, io->flags, 1768 ublk_get_iod(ubq, req->tag)->addr); 1769 1770 /* 1771 * Task is exiting if either: 1772 * 1773 * (1) current != io->task. 1774 * io_uring_cmd_complete_in_task() tries to run task_work 1775 * in a workqueue if cmd's task is PF_EXITING. 1776 * 1777 * (2) current->flags & PF_EXITING. 1778 */ 1779 if (unlikely(current != io->task || current->flags & PF_EXITING)) { 1780 __ublk_abort_rq(ubq, req); 1781 return; 1782 } 1783 1784 if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { 1785 /* 1786 * We have not handled UBLK_IO_NEED_GET_DATA command yet, 1787 * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv 1788 * and notify it. 1789 */ 1790 io->flags |= UBLK_IO_FLAG_NEED_GET_DATA; 1791 pr_devel("%s: need get data. 
qid %d tag %d io_flags %x\n", 1792 __func__, ubq->q_id, req->tag, io->flags); 1793 ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA, 1794 issue_flags); 1795 return; 1796 } 1797 1798 if (!ublk_start_io(ubq, req, io)) 1799 return; 1800 1801 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { 1802 ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags); 1803 } else { 1804 ublk_init_req_ref(ubq, io); 1805 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); 1806 } 1807} 1808 1809static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq, 1810 const struct ublk_batch_io_data *data, 1811 unsigned short tag) 1812{ 1813 struct ublk_device *ub = data->ub; 1814 struct ublk_io *io = &ubq->ios[tag]; 1815 struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); 1816 enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK; 1817 struct io_uring_cmd *cmd = data->cmd; 1818 1819 if (!ublk_start_io(ubq, req, io)) 1820 return false; 1821 1822 if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { 1823 res = ublk_auto_buf_register(ubq, req, io, cmd, 1824 data->issue_flags); 1825 1826 if (res == AUTO_BUF_REG_FAIL) 1827 return false; 1828 } 1829 1830 ublk_io_lock(io); 1831 ublk_auto_buf_io_setup(ubq, req, io, cmd, res); 1832 ublk_io_unlock(io); 1833 1834 return true; 1835} 1836 1837static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq, 1838 const struct ublk_batch_io_data *data, 1839 unsigned short *tag_buf, 1840 unsigned int len) 1841{ 1842 bool has_unused = false; 1843 unsigned int i; 1844 1845 for (i = 0; i < len; i++) { 1846 unsigned short tag = tag_buf[i]; 1847 1848 if (!__ublk_batch_prep_dispatch(ubq, data, tag)) { 1849 tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG; 1850 has_unused = true; 1851 } 1852 } 1853 1854 return has_unused; 1855} 1856 1857/* 1858 * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf. 1859 * Returns the new length after filtering. 1860 */ 1861static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, 1862 unsigned int len) 1863{ 1864 unsigned int i, j; 1865 1866 for (i = 0, j = 0; i < len; i++) { 1867 if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) { 1868 if (i != j) 1869 tag_buf[j] = tag_buf[i]; 1870 j++; 1871 } 1872 } 1873 1874 return j; 1875} 1876 1877static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq, 1878 const struct ublk_batch_io_data *data, 1879 unsigned short *tag_buf, size_t len, int ret) 1880{ 1881 int i, res; 1882 1883 /* 1884 * Undo prep state for all IOs since userspace never received them. 1885 * This restores IOs to pre-prepared state so they can be cleanly 1886 * re-prepared when tags are pulled from FIFO again. 
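 *
 * This means unregistering any auto-registered buffer, clearing
 * UBLK_IO_FLAG_OWNED_BY_SRV and UBLK_IO_FLAG_AUTO_BUF_REG, and marking the
 * IO as UBLK_IO_FLAG_ACTIVE again before the tags are returned to evts_fifo.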
1887 */ 1888 for (i = 0; i < len; i++) { 1889 struct ublk_io *io = &ubq->ios[tag_buf[i]]; 1890 int index = -1; 1891 1892 ublk_io_lock(io); 1893 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) 1894 index = io->buf.auto_reg.index; 1895 io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); 1896 io->flags |= UBLK_IO_FLAG_ACTIVE; 1897 ublk_io_unlock(io); 1898 1899 if (index != -1) 1900 io_buffer_unregister_bvec(data->cmd, index, 1901 data->issue_flags); 1902 } 1903 1904 res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, 1905 tag_buf, len, &ubq->evts_lock); 1906 1907 pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " 1908 "tags(%d %zu) ret %d\n", __func__, res, len, 1909 ret); 1910} 1911 1912#define MAX_NR_TAG 128 1913static int __ublk_batch_dispatch(struct ublk_queue *ubq, 1914 const struct ublk_batch_io_data *data, 1915 struct ublk_batch_fetch_cmd *fcmd) 1916{ 1917 const unsigned int tag_sz = sizeof(unsigned short); 1918 unsigned short tag_buf[MAX_NR_TAG]; 1919 struct io_br_sel sel; 1920 size_t len = 0; 1921 bool needs_filter; 1922 int ret; 1923 1924 WARN_ON_ONCE(data->cmd != fcmd->cmd); 1925 1926 sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len, 1927 data->issue_flags); 1928 if (sel.val < 0) 1929 return sel.val; 1930 if (!sel.addr) 1931 return -ENOBUFS; 1932 1933 /* single reader needn't lock and sizeof(kfifo element) is 2 bytes */ 1934 len = min(len, sizeof(tag_buf)) / tag_sz; 1935 len = kfifo_out(&ubq->evts_fifo, tag_buf, len); 1936 1937 needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len); 1938 /* Filter out unused tags before posting to userspace */ 1939 if (unlikely(needs_filter)) { 1940 int new_len = ublk_filter_unused_tags(tag_buf, len); 1941 1942 /* return actual length if all are failed or requeued */ 1943 if (!new_len) { 1944 /* release the selected buffer */ 1945 sel.val = 0; 1946 WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd, 1947 &sel, data->issue_flags)); 1948 return len; 1949 } 1950 len = new_len; 1951 } 1952 1953 sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz); 1954 ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags); 1955 if (unlikely(ret < 0)) 1956 ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret); 1957 return ret; 1958} 1959 1960static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd( 1961 struct ublk_queue *ubq) 1962{ 1963 struct ublk_batch_fetch_cmd *fcmd; 1964 1965 lockdep_assert_held(&ubq->evts_lock); 1966 1967 /* 1968 * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd. 1969 * 1970 * The pair is the smp_mb() in ublk_batch_dispatch(). 1971 * 1972 * If ubq->active_fcmd is observed as non-NULL, the new added tags 1973 * can be visisible in ublk_batch_dispatch() with the barrier pairing. 
1974 */ 1975 smp_mb(); 1976 if (READ_ONCE(ubq->active_fcmd)) { 1977 fcmd = NULL; 1978 } else { 1979 fcmd = list_first_entry_or_null(&ubq->fcmd_head, 1980 struct ublk_batch_fetch_cmd, node); 1981 WRITE_ONCE(ubq->active_fcmd, fcmd); 1982 } 1983 return fcmd; 1984} 1985 1986static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) 1987{ 1988 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 1989 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 1990 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1991 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; 1992 struct ublk_batch_io_data data = { 1993 .ub = pdu->ubq->dev, 1994 .cmd = fcmd->cmd, 1995 .issue_flags = issue_flags, 1996 }; 1997 1998 WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd); 1999 2000 ublk_batch_dispatch(pdu->ubq, &data, fcmd); 2001} 2002 2003static void 2004ublk_batch_dispatch(struct ublk_queue *ubq, 2005 const struct ublk_batch_io_data *data, 2006 struct ublk_batch_fetch_cmd *fcmd) 2007{ 2008 struct ublk_batch_fetch_cmd *new_fcmd; 2009 unsigned tried = 0; 2010 int ret = 0; 2011 2012again: 2013 while (!ublk_io_evts_empty(ubq)) { 2014 ret = __ublk_batch_dispatch(ubq, data, fcmd); 2015 if (ret <= 0) 2016 break; 2017 } 2018 2019 if (ret < 0) { 2020 ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret); 2021 return; 2022 } 2023 2024 __ublk_release_fcmd(ubq); 2025 /* 2026 * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and 2027 * checking ubq->evts_fifo. 2028 * 2029 * The pair is the smp_mb() in __ublk_acquire_fcmd(). 2030 */ 2031 smp_mb(); 2032 if (likely(ublk_io_evts_empty(ubq))) 2033 return; 2034 2035 spin_lock(&ubq->evts_lock); 2036 new_fcmd = __ublk_acquire_fcmd(ubq); 2037 spin_unlock(&ubq->evts_lock); 2038 2039 if (!new_fcmd) 2040 return; 2041 2042 /* Avoid lockup by allowing to handle at most 32 batches */ 2043 if (new_fcmd == fcmd && tried++ < 32) 2044 goto again; 2045 2046 io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb); 2047} 2048 2049static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) 2050{ 2051 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 2052 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 2053 struct ublk_queue *ubq = pdu->ubq; 2054 2055 ublk_dispatch_req(ubq, pdu->req); 2056} 2057 2058static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last) 2059{ 2060 unsigned short tag = rq->tag; 2061 struct ublk_batch_fetch_cmd *fcmd = NULL; 2062 2063 spin_lock(&ubq->evts_lock); 2064 kfifo_put(&ubq->evts_fifo, tag); 2065 if (last) 2066 fcmd = __ublk_acquire_fcmd(ubq); 2067 spin_unlock(&ubq->evts_lock); 2068 2069 if (fcmd) 2070 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); 2071} 2072 2073static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) 2074{ 2075 struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; 2076 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 2077 2078 pdu->req = rq; 2079 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); 2080} 2081 2082static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) 2083{ 2084 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 2085 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 2086 struct request *rq = pdu->req_list; 2087 struct request *next; 2088 2089 do { 2090 next = rq->rq_next; 2091 rq->rq_next = NULL; 2092 ublk_dispatch_req(rq->mq_hctx->driver_data, rq); 2093 rq = next; 2094 } while (rq); 2095} 2096 2097static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l) 
2098{ 2099 struct io_uring_cmd *cmd = io->cmd; 2100 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 2101 2102 pdu->req_list = rq_list_peek(l); 2103 rq_list_init(l); 2104 io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); 2105} 2106 2107static enum blk_eh_timer_return ublk_timeout(struct request *rq) 2108{ 2109 struct ublk_queue *ubq = rq->mq_hctx->driver_data; 2110 pid_t tgid = ubq->dev->ublksrv_tgid; 2111 struct task_struct *p; 2112 struct pid *pid; 2113 2114 if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV)) 2115 return BLK_EH_RESET_TIMER; 2116 2117 if (unlikely(!tgid)) 2118 return BLK_EH_RESET_TIMER; 2119 2120 rcu_read_lock(); 2121 pid = find_vpid(tgid); 2122 p = pid_task(pid, PIDTYPE_PID); 2123 if (p) 2124 send_sig(SIGKILL, p, 0); 2125 rcu_read_unlock(); 2126 return BLK_EH_DONE; 2127} 2128 2129static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, 2130 bool check_cancel) 2131{ 2132 blk_status_t res; 2133 2134 if (unlikely(READ_ONCE(ubq->fail_io))) 2135 return BLK_STS_TARGET; 2136 2137 /* With recovery feature enabled, force_abort is set in 2138 * ublk_stop_dev() before calling del_gendisk(). We have to 2139 * abort all requeued and new rqs here to let del_gendisk() 2140 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task() 2141 * to avoid UAF on io_uring ctx. 2142 * 2143 * Note: force_abort is guaranteed to be seen because it is set 2144 * before request queue is unqiuesced. 2145 */ 2146 if (ublk_nosrv_should_queue_io(ubq) && 2147 unlikely(READ_ONCE(ubq->force_abort))) 2148 return BLK_STS_IOERR; 2149 2150 if (check_cancel && unlikely(ubq->canceling)) 2151 return BLK_STS_IOERR; 2152 2153 /* fill iod to slot in io cmd buffer */ 2154 res = ublk_setup_iod(ubq, rq); 2155 if (unlikely(res != BLK_STS_OK)) 2156 return BLK_STS_IOERR; 2157 2158 blk_mq_start_request(rq); 2159 return BLK_STS_OK; 2160} 2161 2162/* 2163 * Common helper for queue_rq that handles request preparation and 2164 * cancellation checks. Returns status and sets should_queue to indicate 2165 * whether the caller should proceed with queuing the request. 
2166 */ 2167static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq, 2168 struct request *rq, 2169 bool *should_queue) 2170{ 2171 blk_status_t res; 2172 2173 res = ublk_prep_req(ubq, rq, false); 2174 if (res != BLK_STS_OK) { 2175 *should_queue = false; 2176 return res; 2177 } 2178 2179 /* 2180 * ->canceling has to be handled after ->force_abort and ->fail_io 2181 * is dealt with, otherwise this request may not be failed in case 2182 * of recovery, and cause hang when deleting disk 2183 */ 2184 if (unlikely(ubq->canceling)) { 2185 *should_queue = false; 2186 __ublk_abort_rq(ubq, rq); 2187 return BLK_STS_OK; 2188 } 2189 2190 *should_queue = true; 2191 return BLK_STS_OK; 2192} 2193 2194static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, 2195 const struct blk_mq_queue_data *bd) 2196{ 2197 struct ublk_queue *ubq = hctx->driver_data; 2198 struct request *rq = bd->rq; 2199 bool should_queue; 2200 blk_status_t res; 2201 2202 res = __ublk_queue_rq_common(ubq, rq, &should_queue); 2203 if (!should_queue) 2204 return res; 2205 2206 ublk_queue_cmd(ubq, rq); 2207 return BLK_STS_OK; 2208} 2209 2210static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx, 2211 const struct blk_mq_queue_data *bd) 2212{ 2213 struct ublk_queue *ubq = hctx->driver_data; 2214 struct request *rq = bd->rq; 2215 bool should_queue; 2216 blk_status_t res; 2217 2218 res = __ublk_queue_rq_common(ubq, rq, &should_queue); 2219 if (!should_queue) 2220 return res; 2221 2222 ublk_batch_queue_cmd(ubq, rq, bd->last); 2223 return BLK_STS_OK; 2224} 2225 2226static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, 2227 const struct ublk_io *io2) 2228{ 2229 return (io_uring_cmd_ctx_handle(io->cmd) == 2230 io_uring_cmd_ctx_handle(io2->cmd)) && 2231 (io->task == io2->task); 2232} 2233 2234static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) 2235{ 2236 struct ublk_queue *ubq = hctx->driver_data; 2237 struct ublk_batch_fetch_cmd *fcmd; 2238 2239 spin_lock(&ubq->evts_lock); 2240 fcmd = __ublk_acquire_fcmd(ubq); 2241 spin_unlock(&ubq->evts_lock); 2242 2243 if (fcmd) 2244 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); 2245} 2246 2247static void ublk_queue_rqs(struct rq_list *rqlist) 2248{ 2249 struct rq_list requeue_list = { }; 2250 struct rq_list submit_list = { }; 2251 struct ublk_io *io = NULL; 2252 struct request *req; 2253 2254 while ((req = rq_list_pop(rqlist))) { 2255 struct ublk_queue *this_q = req->mq_hctx->driver_data; 2256 struct ublk_io *this_io = &this_q->ios[req->tag]; 2257 2258 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { 2259 rq_list_add_tail(&requeue_list, req); 2260 continue; 2261 } 2262 2263 if (io && !ublk_belong_to_same_batch(io, this_io) && 2264 !rq_list_empty(&submit_list)) 2265 ublk_queue_cmd_list(io, &submit_list); 2266 io = this_io; 2267 rq_list_add_tail(&submit_list, req); 2268 } 2269 2270 if (!rq_list_empty(&submit_list)) 2271 ublk_queue_cmd_list(io, &submit_list); 2272 *rqlist = requeue_list; 2273} 2274 2275static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) 2276{ 2277 unsigned short tags[MAX_NR_TAG]; 2278 struct ublk_batch_fetch_cmd *fcmd; 2279 struct request *rq; 2280 unsigned cnt = 0; 2281 2282 spin_lock(&ubq->evts_lock); 2283 rq_list_for_each(l, rq) { 2284 tags[cnt++] = (unsigned short)rq->tag; 2285 if (cnt >= MAX_NR_TAG) { 2286 kfifo_in(&ubq->evts_fifo, tags, cnt); 2287 cnt = 0; 2288 } 2289 } 2290 if (cnt) 2291 kfifo_in(&ubq->evts_fifo, tags, cnt); 2292 fcmd = __ublk_acquire_fcmd(ubq); 2293 
spin_unlock(&ubq->evts_lock); 2294 2295 rq_list_init(l); 2296 if (fcmd) 2297 io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); 2298} 2299 2300static void ublk_batch_queue_rqs(struct rq_list *rqlist) 2301{ 2302 struct rq_list requeue_list = { }; 2303 struct rq_list submit_list = { }; 2304 struct ublk_queue *ubq = NULL; 2305 struct request *req; 2306 2307 while ((req = rq_list_pop(rqlist))) { 2308 struct ublk_queue *this_q = req->mq_hctx->driver_data; 2309 2310 if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { 2311 rq_list_add_tail(&requeue_list, req); 2312 continue; 2313 } 2314 2315 if (ubq && this_q != ubq && !rq_list_empty(&submit_list)) 2316 ublk_batch_queue_cmd_list(ubq, &submit_list); 2317 ubq = this_q; 2318 rq_list_add_tail(&submit_list, req); 2319 } 2320 2321 if (!rq_list_empty(&submit_list)) 2322 ublk_batch_queue_cmd_list(ubq, &submit_list); 2323 *rqlist = requeue_list; 2324} 2325 2326static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, 2327 unsigned int hctx_idx) 2328{ 2329 struct ublk_device *ub = driver_data; 2330 struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num); 2331 2332 hctx->driver_data = ubq; 2333 return 0; 2334} 2335 2336static const struct blk_mq_ops ublk_mq_ops = { 2337 .queue_rq = ublk_queue_rq, 2338 .queue_rqs = ublk_queue_rqs, 2339 .init_hctx = ublk_init_hctx, 2340 .timeout = ublk_timeout, 2341}; 2342 2343static const struct blk_mq_ops ublk_batch_mq_ops = { 2344 .commit_rqs = ublk_commit_rqs, 2345 .queue_rq = ublk_batch_queue_rq, 2346 .queue_rqs = ublk_batch_queue_rqs, 2347 .init_hctx = ublk_init_hctx, 2348 .timeout = ublk_timeout, 2349}; 2350 2351static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) 2352{ 2353 int i; 2354 2355 ubq->nr_io_ready = 0; 2356 2357 for (i = 0; i < ubq->q_depth; i++) { 2358 struct ublk_io *io = &ubq->ios[i]; 2359 2360 /* 2361 * UBLK_IO_FLAG_CANCELED is kept for avoiding to touch 2362 * io->cmd 2363 */ 2364 io->flags &= UBLK_IO_FLAG_CANCELED; 2365 io->cmd = NULL; 2366 io->buf.addr = 0; 2367 2368 /* 2369 * old task is PF_EXITING, put it now 2370 * 2371 * It could be NULL in case of closing one quiesced 2372 * device. 
2373 */ 2374 if (io->task) { 2375 put_task_struct(io->task); 2376 io->task = NULL; 2377 } 2378 2379 WARN_ON_ONCE(refcount_read(&io->ref)); 2380 WARN_ON_ONCE(io->task_registered_buffers); 2381 } 2382} 2383 2384static int ublk_ch_open(struct inode *inode, struct file *filp) 2385{ 2386 struct ublk_device *ub = container_of(inode->i_cdev, 2387 struct ublk_device, cdev); 2388 2389 if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) 2390 return -EBUSY; 2391 filp->private_data = ub; 2392 ub->ublksrv_tgid = current->tgid; 2393 return 0; 2394} 2395 2396static void ublk_reset_ch_dev(struct ublk_device *ub) 2397{ 2398 int i; 2399 2400 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2401 ublk_queue_reinit(ub, ublk_get_queue(ub, i)); 2402 2403 /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ 2404 ub->mm = NULL; 2405 ub->nr_queue_ready = 0; 2406 ub->unprivileged_daemons = false; 2407 ub->ublksrv_tgid = -1; 2408} 2409 2410static struct gendisk *ublk_get_disk(struct ublk_device *ub) 2411{ 2412 struct gendisk *disk; 2413 2414 spin_lock(&ub->lock); 2415 disk = ub->ub_disk; 2416 if (disk) 2417 get_device(disk_to_dev(disk)); 2418 spin_unlock(&ub->lock); 2419 2420 return disk; 2421} 2422 2423static void ublk_put_disk(struct gendisk *disk) 2424{ 2425 if (disk) 2426 put_device(disk_to_dev(disk)); 2427} 2428 2429static void ublk_partition_scan_work(struct work_struct *work) 2430{ 2431 struct ublk_device *ub = 2432 container_of(work, struct ublk_device, partition_scan_work); 2433 /* Hold disk reference to prevent UAF during concurrent teardown */ 2434 struct gendisk *disk = ublk_get_disk(ub); 2435 2436 if (!disk) 2437 return; 2438 2439 if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN, 2440 &disk->state))) 2441 goto out; 2442 2443 mutex_lock(&disk->open_mutex); 2444 bdev_disk_changed(disk, false); 2445 mutex_unlock(&disk->open_mutex); 2446out: 2447 ublk_put_disk(disk); 2448} 2449 2450/* 2451 * Use this function to ensure that ->canceling is consistently set for 2452 * the device and all queues. Do not set these flags directly. 2453 * 2454 * Caller must ensure that: 2455 * - cancel_mutex is held. This ensures that there is no concurrent 2456 * access to ub->canceling and no concurrent writes to ubq->canceling. 2457 * - there are no concurrent reads of ubq->canceling from the queue_rq 2458 * path. This can be done by quiescing the queue, or through other 2459 * means. 
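 * ublk_start_cancel() is an example of the quiesce-based approach.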
 2460 */ 2461static void ublk_set_canceling(struct ublk_device *ub, bool canceling) 2462 __must_hold(&ub->cancel_mutex) 2463{ 2464 int i; 2465 2466 ub->canceling = canceling; 2467 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2468 ublk_get_queue(ub, i)->canceling = canceling; 2469} 2470 2471static bool ublk_check_and_reset_active_ref(struct ublk_device *ub) 2472{ 2473 int i, j; 2474 2475 if (!ublk_dev_need_req_ref(ub)) 2476 return false; 2477 2478 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 2479 struct ublk_queue *ubq = ublk_get_queue(ub, i); 2480 2481 for (j = 0; j < ubq->q_depth; j++) { 2482 struct ublk_io *io = &ubq->ios[j]; 2483 unsigned int refs = refcount_read(&io->ref) + 2484 io->task_registered_buffers; 2485 2486 /* 2487 * UBLK_REFCOUNT_INIT or zero means no active 2488 * reference 2489 */ 2490 if (refs != UBLK_REFCOUNT_INIT && refs != 0) 2491 return true; 2492 2493 /* reset to zero if the io has no active references */ 2494 refcount_set(&io->ref, 0); 2495 io->task_registered_buffers = 0; 2496 } 2497 } 2498 return false; 2499} 2500 2501static void ublk_ch_release_work_fn(struct work_struct *work) 2502{ 2503 struct ublk_device *ub = 2504 container_of(work, struct ublk_device, exit_work.work); 2505 struct gendisk *disk; 2506 int i; 2507 2508 /* 2509 * For zero-copy and auto buffer register modes, I/O references 2510 * might not be dropped naturally when the daemon is killed, but 2511 * io_uring guarantees that registered bvec kernel buffers are 2512 * unregistered finally when the io_uring context is freed, and then 2513 * the active references are dropped. 2514 * 2515 * Wait until active references are dropped to avoid use-after-free. 2516 * 2517 * A registered buffer may be unregistered in io_uring's release handler, 2518 * so we have to wait by scheduling this work function, to avoid a 2519 * dependency between the two file releases. 2520 */ 2521 if (ublk_check_and_reset_active_ref(ub)) { 2522 schedule_delayed_work(&ub->exit_work, 1); 2523 return; 2524 } 2525 2526 /* 2527 * The disk isn't attached: either the device isn't live, or it has 2528 * been removed already, so we needn't do anything 2529 */ 2530 disk = ublk_get_disk(ub); 2531 if (!disk) 2532 goto out; 2533 2534 /* 2535 * All uring_cmds are done now, so abort any request outstanding to 2536 * the ublk server 2537 * 2538 * This can be done in a lockless way because the ublk server is 2539 * gone 2540 * 2541 * More importantly, we have to provide a forward progress guarantee 2542 * without holding ub->mutex, otherwise a control task grabbing 2543 * ub->mutex triggers deadlock 2544 * 2545 * All requests may be inflight, so ->canceling may not be set, set 2546 * it now. 2547 */ 2548 mutex_lock(&ub->cancel_mutex); 2549 ublk_set_canceling(ub, true); 2550 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2551 ublk_abort_queue(ub, ublk_get_queue(ub, i)); 2552 mutex_unlock(&ub->cancel_mutex); 2553 blk_mq_kick_requeue_list(disk->queue); 2554 2555 /* 2556 * All inflight requests have been completed or requeued and any new 2557 * request will be failed or requeued via `->canceling` now, so it is 2558 * fine to grab ub->mutex now. 2559 */ 2560 mutex_lock(&ub->mutex); 2561 2562 /* double check after grabbing lock */ 2563 if (!ub->ub_disk) 2564 goto unlock; 2565 2566 /* 2567 * Transition the device to the nosrv state. What exactly this 2568 * means depends on the recovery flags 2569 */ 2570 if (ublk_nosrv_should_stop_dev(ub)) { 2571 /* 2572 * Allow any pending/future I/O to pass through quickly 2573 * with an error.
This is needed because del_gendisk 2574 * waits for all pending I/O to complete 2575 */ 2576 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2577 WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true); 2578 2579 ublk_stop_dev_unlocked(ub); 2580 } else { 2581 if (ublk_nosrv_dev_should_queue_io(ub)) { 2582 /* ->canceling is set and all requests are aborted */ 2583 ub->dev_info.state = UBLK_S_DEV_QUIESCED; 2584 } else { 2585 ub->dev_info.state = UBLK_S_DEV_FAIL_IO; 2586 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2587 WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true); 2588 } 2589 } 2590unlock: 2591 mutex_unlock(&ub->mutex); 2592 ublk_put_disk(disk); 2593 2594 /* all uring_cmd has been done now, reset device & ubq */ 2595 ublk_reset_ch_dev(ub); 2596out: 2597 clear_bit(UB_STATE_OPEN, &ub->state); 2598 2599 /* put the reference grabbed in ublk_ch_release() */ 2600 ublk_put_device(ub); 2601} 2602 2603static int ublk_ch_release(struct inode *inode, struct file *filp) 2604{ 2605 struct ublk_device *ub = filp->private_data; 2606 2607 /* 2608 * Grab ublk device reference, so it won't be gone until we are 2609 * really released from work function. 2610 */ 2611 ublk_get_device(ub); 2612 2613 INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn); 2614 schedule_delayed_work(&ub->exit_work, 0); 2615 return 0; 2616} 2617 2618/* map pre-allocated per-queue cmd buffer to ublksrv daemon */ 2619static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) 2620{ 2621 struct ublk_device *ub = filp->private_data; 2622 size_t sz = vma->vm_end - vma->vm_start; 2623 unsigned max_sz = ublk_max_cmd_buf_size(); 2624 unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT; 2625 int q_id, ret = 0; 2626 2627 spin_lock(&ub->lock); 2628 if (!ub->mm) 2629 ub->mm = current->mm; 2630 if (current->mm != ub->mm) 2631 ret = -EINVAL; 2632 spin_unlock(&ub->lock); 2633 2634 if (ret) 2635 return ret; 2636 2637 if (vma->vm_flags & VM_WRITE) 2638 return -EPERM; 2639 2640 end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz; 2641 if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end) 2642 return -EINVAL; 2643 2644 q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz; 2645 pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n", 2646 __func__, q_id, current->pid, vma->vm_start, 2647 phys_off, (unsigned long)sz); 2648 2649 if (sz != ublk_queue_cmd_buf_size(ub)) 2650 return -EINVAL; 2651 2652 pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; 2653 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 2654} 2655 2656static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, 2657 struct request *req) 2658{ 2659 WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) && 2660 io->flags & UBLK_IO_FLAG_ACTIVE); 2661 2662 if (ublk_nosrv_should_reissue_outstanding(ub)) 2663 blk_mq_requeue_request(req, false); 2664 else { 2665 io->res = -EIO; 2666 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); 2667 } 2668} 2669 2670/* 2671 * Request tag may just be filled to event kfifo, not get chance to 2672 * dispatch, abort these requests too 2673 */ 2674static void ublk_abort_batch_queue(struct ublk_device *ub, 2675 struct ublk_queue *ubq) 2676{ 2677 unsigned short tag; 2678 2679 while (kfifo_out(&ubq->evts_fifo, &tag, 1)) { 2680 struct request *req = blk_mq_tag_to_rq( 2681 ub->tag_set.tags[ubq->q_id], tag); 2682 2683 if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req))) 2684 __ublk_fail_req(ub, &ubq->ios[tag], req); 2685 } 2686} 2687 2688/* 2689 * Called from ublk char 
device release handler, when any uring_cmd is 2690 * done, meantime request queue is "quiesced" since all inflight requests 2691 * can't be completed because ublk server is dead. 2692 * 2693 * So no one can hold our request IO reference any more, simply ignore the 2694 * reference, and complete the request immediately 2695 */ 2696static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) 2697{ 2698 int i; 2699 2700 for (i = 0; i < ubq->q_depth; i++) { 2701 struct ublk_io *io = &ubq->ios[i]; 2702 2703 if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) 2704 __ublk_fail_req(ub, io, io->req); 2705 } 2706 2707 if (ublk_support_batch_io(ubq)) 2708 ublk_abort_batch_queue(ub, ubq); 2709} 2710 2711static void ublk_start_cancel(struct ublk_device *ub) 2712{ 2713 struct gendisk *disk = ublk_get_disk(ub); 2714 2715 /* Our disk has been dead */ 2716 if (!disk) 2717 return; 2718 2719 mutex_lock(&ub->cancel_mutex); 2720 if (ub->canceling) 2721 goto out; 2722 /* 2723 * Now we are serialized with ublk_queue_rq() 2724 * 2725 * Make sure that ubq->canceling is set when queue is frozen, 2726 * because ublk_queue_rq() has to rely on this flag for avoiding to 2727 * touch completed uring_cmd 2728 */ 2729 blk_mq_quiesce_queue(disk->queue); 2730 ublk_set_canceling(ub, true); 2731 blk_mq_unquiesce_queue(disk->queue); 2732out: 2733 mutex_unlock(&ub->cancel_mutex); 2734 ublk_put_disk(disk); 2735} 2736 2737static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, 2738 unsigned int issue_flags) 2739{ 2740 struct ublk_io *io = &ubq->ios[tag]; 2741 struct ublk_device *ub = ubq->dev; 2742 struct request *req; 2743 bool done; 2744 2745 if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) 2746 return; 2747 2748 /* 2749 * Don't try to cancel this command if the request is started for 2750 * avoiding race between io_uring_cmd_done() and 2751 * io_uring_cmd_complete_in_task(). 2752 * 2753 * Either the started request will be aborted via __ublk_abort_rq(), 2754 * then this uring_cmd is canceled next time, or it will be done in 2755 * task work function ublk_dispatch_req() because io_uring guarantees 2756 * that ublk_dispatch_req() is always called 2757 */ 2758 req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); 2759 if (req && blk_mq_request_started(req) && req->tag == tag) 2760 return; 2761 2762 spin_lock(&ubq->cancel_lock); 2763 done = !!(io->flags & UBLK_IO_FLAG_CANCELED); 2764 if (!done) 2765 io->flags |= UBLK_IO_FLAG_CANCELED; 2766 spin_unlock(&ubq->cancel_lock); 2767 2768 if (!done) 2769 io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); 2770} 2771 2772/* 2773 * Cancel a batch fetch command if it hasn't been claimed by another path. 2774 * 2775 * An fcmd can only be cancelled if: 2776 * 1. It's not the active_fcmd (which is currently being processed) 2777 * 2. It's still on the list (!list_empty check) - once removed from the list, 2778 * the fcmd is considered claimed and will be freed by whoever removed it 2779 * 2780 * Use list_del_init() so subsequent list_empty() checks work correctly. 
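 * Both checks are done under ubq->evts_lock, so cancellation cannot race
 * with the dispatch path claiming the same fcmd.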
2781 */ 2782static void ublk_batch_cancel_cmd(struct ublk_queue *ubq, 2783 struct ublk_batch_fetch_cmd *fcmd, 2784 unsigned int issue_flags) 2785{ 2786 bool done; 2787 2788 spin_lock(&ubq->evts_lock); 2789 done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node); 2790 if (done) 2791 list_del_init(&fcmd->node); 2792 spin_unlock(&ubq->evts_lock); 2793 2794 if (done) { 2795 io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags); 2796 ublk_batch_free_fcmd(fcmd); 2797 } 2798} 2799 2800static void ublk_batch_cancel_queue(struct ublk_queue *ubq) 2801{ 2802 struct ublk_batch_fetch_cmd *fcmd; 2803 LIST_HEAD(fcmd_list); 2804 2805 spin_lock(&ubq->evts_lock); 2806 ubq->force_abort = true; 2807 list_splice_init(&ubq->fcmd_head, &fcmd_list); 2808 fcmd = READ_ONCE(ubq->active_fcmd); 2809 if (fcmd) 2810 list_move(&fcmd->node, &ubq->fcmd_head); 2811 spin_unlock(&ubq->evts_lock); 2812 2813 while (!list_empty(&fcmd_list)) { 2814 fcmd = list_first_entry(&fcmd_list, 2815 struct ublk_batch_fetch_cmd, node); 2816 ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED); 2817 } 2818} 2819 2820static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd, 2821 unsigned int issue_flags) 2822{ 2823 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 2824 struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; 2825 struct ublk_queue *ubq = pdu->ubq; 2826 2827 ublk_start_cancel(ubq->dev); 2828 2829 ublk_batch_cancel_cmd(ubq, fcmd, issue_flags); 2830} 2831 2832/* 2833 * The ublk char device won't be closed when calling cancel fn, so both 2834 * ublk device and queue are guaranteed to be live 2835 * 2836 * Two-stage cancel: 2837 * 2838 * - make every active uring_cmd done in ->cancel_fn() 2839 * 2840 * - aborting inflight ublk IO requests in ublk char device release handler, 2841 * which depends on 1st stage because device can only be closed iff all 2842 * uring_cmd are done 2843 * 2844 * Do _not_ try to acquire ub->mutex before all inflight requests are 2845 * aborted, otherwise deadlock may be caused. 
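 * See ublk_ch_release_work_fn() for the second stage.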
2846 */ 2847static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, 2848 unsigned int issue_flags) 2849{ 2850 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 2851 struct ublk_queue *ubq = pdu->ubq; 2852 struct task_struct *task; 2853 struct ublk_io *io; 2854 2855 if (WARN_ON_ONCE(!ubq)) 2856 return; 2857 2858 if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth)) 2859 return; 2860 2861 task = io_uring_cmd_get_task(cmd); 2862 io = &ubq->ios[pdu->tag]; 2863 if (WARN_ON_ONCE(task && task != io->task)) 2864 return; 2865 2866 ublk_start_cancel(ubq->dev); 2867 2868 WARN_ON_ONCE(io->cmd != cmd); 2869 ublk_cancel_cmd(ubq, pdu->tag, issue_flags); 2870} 2871 2872static inline bool ublk_queue_ready(const struct ublk_queue *ubq) 2873{ 2874 return ubq->nr_io_ready == ubq->q_depth; 2875} 2876 2877static inline bool ublk_dev_ready(const struct ublk_device *ub) 2878{ 2879 return ub->nr_queue_ready == ub->dev_info.nr_hw_queues; 2880} 2881 2882static void ublk_cancel_queue(struct ublk_queue *ubq) 2883{ 2884 int i; 2885 2886 if (ublk_support_batch_io(ubq)) { 2887 ublk_batch_cancel_queue(ubq); 2888 return; 2889 } 2890 2891 for (i = 0; i < ubq->q_depth; i++) 2892 ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); 2893} 2894 2895/* Cancel all pending commands, must be called after del_gendisk() returns */ 2896static void ublk_cancel_dev(struct ublk_device *ub) 2897{ 2898 int i; 2899 2900 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2901 ublk_cancel_queue(ublk_get_queue(ub, i)); 2902} 2903 2904static bool ublk_check_inflight_rq(struct request *rq, void *data) 2905{ 2906 bool *idle = data; 2907 2908 if (blk_mq_request_started(rq)) { 2909 *idle = false; 2910 return false; 2911 } 2912 return true; 2913} 2914 2915static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub) 2916{ 2917 bool idle; 2918 2919 WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue)); 2920 while (true) { 2921 idle = true; 2922 blk_mq_tagset_busy_iter(&ub->tag_set, 2923 ublk_check_inflight_rq, &idle); 2924 if (idle) 2925 break; 2926 msleep(UBLK_REQUEUE_DELAY_MS); 2927 } 2928} 2929 2930static void ublk_force_abort_dev(struct ublk_device *ub) 2931{ 2932 int i; 2933 2934 pr_devel("%s: force abort ub: dev_id %d state %s\n", 2935 __func__, ub->dev_info.dev_id, 2936 ub->dev_info.state == UBLK_S_DEV_LIVE ? 
2937 "LIVE" : "QUIESCED"); 2938 blk_mq_quiesce_queue(ub->ub_disk->queue); 2939 if (ub->dev_info.state == UBLK_S_DEV_LIVE) 2940 ublk_wait_tagset_rqs_idle(ub); 2941 2942 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 2943 ublk_get_queue(ub, i)->force_abort = true; 2944 blk_mq_unquiesce_queue(ub->ub_disk->queue); 2945 /* We may have requeued some rqs in ublk_quiesce_queue() */ 2946 blk_mq_kick_requeue_list(ub->ub_disk->queue); 2947} 2948 2949static struct gendisk *ublk_detach_disk(struct ublk_device *ub) 2950{ 2951 struct gendisk *disk; 2952 2953 /* Sync with ublk_abort_queue() by holding the lock */ 2954 spin_lock(&ub->lock); 2955 disk = ub->ub_disk; 2956 ub->dev_info.state = UBLK_S_DEV_DEAD; 2957 ub->dev_info.ublksrv_pid = -1; 2958 ub->ub_disk = NULL; 2959 spin_unlock(&ub->lock); 2960 2961 return disk; 2962} 2963 2964static void ublk_stop_dev_unlocked(struct ublk_device *ub) 2965 __must_hold(&ub->mutex) 2966{ 2967 struct gendisk *disk; 2968 2969 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 2970 return; 2971 2972 if (ublk_nosrv_dev_should_queue_io(ub)) 2973 ublk_force_abort_dev(ub); 2974 del_gendisk(ub->ub_disk); 2975 disk = ublk_detach_disk(ub); 2976 put_disk(disk); 2977} 2978 2979static void ublk_stop_dev(struct ublk_device *ub) 2980{ 2981 mutex_lock(&ub->mutex); 2982 ublk_stop_dev_unlocked(ub); 2983 mutex_unlock(&ub->mutex); 2984 cancel_work_sync(&ub->partition_scan_work); 2985 ublk_cancel_dev(ub); 2986} 2987 2988static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io) 2989{ 2990 /* UBLK_IO_FLAG_CANCELED can be cleared now */ 2991 spin_lock(&ubq->cancel_lock); 2992 io->flags &= ~UBLK_IO_FLAG_CANCELED; 2993 spin_unlock(&ubq->cancel_lock); 2994} 2995 2996/* reset per-queue io flags */ 2997static void ublk_queue_reset_io_flags(struct ublk_queue *ubq) 2998{ 2999 spin_lock(&ubq->cancel_lock); 3000 ubq->canceling = false; 3001 spin_unlock(&ubq->cancel_lock); 3002 ubq->fail_io = false; 3003} 3004 3005/* device can only be started after all IOs are ready */ 3006static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id, 3007 struct ublk_io *io) 3008 __must_hold(&ub->mutex) 3009{ 3010 struct ublk_queue *ubq = ublk_get_queue(ub, q_id); 3011 3012 if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) 3013 ub->unprivileged_daemons = true; 3014 3015 ubq->nr_io_ready++; 3016 ublk_reset_io_flags(ubq, io); 3017 3018 /* Check if this specific queue is now fully ready */ 3019 if (ublk_queue_ready(ubq)) { 3020 ub->nr_queue_ready++; 3021 3022 /* 3023 * Reset queue flags as soon as this queue is ready. 3024 * This clears the canceling flag, allowing batch FETCH commands 3025 * to succeed during recovery without waiting for all queues. 3026 */ 3027 ublk_queue_reset_io_flags(ubq); 3028 } 3029 3030 /* Check if all queues are ready */ 3031 if (ublk_dev_ready(ub)) { 3032 /* 3033 * All queues ready - clear device-level canceling flag 3034 * and complete the recovery/initialization. 
3035 */ 3036 mutex_lock(&ub->cancel_mutex); 3037 ub->canceling = false; 3038 mutex_unlock(&ub->cancel_mutex); 3039 complete_all(&ub->completion); 3040 } 3041} 3042 3043static inline int ublk_check_cmd_op(u32 cmd_op) 3044{ 3045 u32 ioc_type = _IOC_TYPE(cmd_op); 3046 3047 if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') 3048 return -EOPNOTSUPP; 3049 3050 if (ioc_type != 'u' && ioc_type != 0) 3051 return -EOPNOTSUPP; 3052 3053 return 0; 3054} 3055 3056static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) 3057{ 3058 struct ublk_auto_buf_reg buf; 3059 3060 buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); 3061 3062 if (buf.reserved0 || buf.reserved1) 3063 return -EINVAL; 3064 3065 if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) 3066 return -EINVAL; 3067 io->buf.auto_reg = buf; 3068 return 0; 3069} 3070 3071static void ublk_clear_auto_buf_reg(struct ublk_io *io, 3072 struct io_uring_cmd *cmd, 3073 u16 *buf_idx) 3074{ 3075 if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) { 3076 io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG; 3077 3078 /* 3079 * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ` 3080 * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same 3081 * `io_ring_ctx`. 3082 * 3083 * If this uring_cmd's io_ring_ctx isn't same with the 3084 * one for registering the buffer, it is ublk server's 3085 * responsibility for unregistering the buffer, otherwise 3086 * this ublk request gets stuck. 3087 */ 3088 if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) 3089 *buf_idx = io->buf.auto_reg.index; 3090 } 3091} 3092 3093static int ublk_handle_auto_buf_reg(struct ublk_io *io, 3094 struct io_uring_cmd *cmd, 3095 u16 *buf_idx) 3096{ 3097 ublk_clear_auto_buf_reg(io, cmd, buf_idx); 3098 return ublk_set_auto_buf_reg(io, cmd); 3099} 3100 3101/* Once we return, `io->req` can't be used any more */ 3102static inline struct request * 3103ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) 3104{ 3105 struct request *req = io->req; 3106 3107 io->cmd = cmd; 3108 io->flags |= UBLK_IO_FLAG_ACTIVE; 3109 /* now this cmd slot is owned by ublk driver */ 3110 io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; 3111 3112 return req; 3113} 3114 3115static inline int 3116ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io, 3117 struct io_uring_cmd *cmd, unsigned long buf_addr, 3118 u16 *buf_idx) 3119{ 3120 if (ublk_dev_support_auto_buf_reg(ub)) 3121 return ublk_handle_auto_buf_reg(io, cmd, buf_idx); 3122 3123 io->buf.addr = buf_addr; 3124 return 0; 3125} 3126 3127static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, 3128 unsigned int issue_flags, 3129 struct ublk_queue *ubq, unsigned int tag) 3130{ 3131 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 3132 3133 /* 3134 * Safe to refer to @ubq since ublk_queue won't be died until its 3135 * commands are completed 3136 */ 3137 pdu->ubq = ubq; 3138 pdu->tag = tag; 3139 io_uring_cmd_mark_cancelable(cmd, issue_flags); 3140} 3141 3142static void ublk_io_release(void *priv) 3143{ 3144 struct request *rq = priv; 3145 struct ublk_queue *ubq = rq->mq_hctx->driver_data; 3146 struct ublk_io *io = &ubq->ios[rq->tag]; 3147 3148 /* 3149 * task_registered_buffers may be 0 if buffers were registered off task 3150 * but unregistered on task. Or after UBLK_IO_COMMIT_AND_FETCH_REQ. 
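 * In that case, fall back to dropping a regular request reference.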
3151 */ 3152 if (current == io->task && io->task_registered_buffers) 3153 io->task_registered_buffers--; 3154 else 3155 ublk_put_req_ref(io, rq); 3156} 3157 3158static int ublk_register_io_buf(struct io_uring_cmd *cmd, 3159 struct ublk_device *ub, 3160 u16 q_id, u16 tag, 3161 struct ublk_io *io, 3162 unsigned int index, unsigned int issue_flags) 3163{ 3164 struct request *req; 3165 int ret; 3166 3167 if (!ublk_dev_support_zero_copy(ub)) 3168 return -EINVAL; 3169 3170 req = __ublk_check_and_get_req(ub, q_id, tag, io); 3171 if (!req) 3172 return -EINVAL; 3173 3174 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, 3175 issue_flags); 3176 if (ret) { 3177 ublk_put_req_ref(io, req); 3178 return ret; 3179 } 3180 3181 return 0; 3182} 3183 3184static int 3185ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, 3186 struct ublk_device *ub, 3187 u16 q_id, u16 tag, struct ublk_io *io, 3188 unsigned index, unsigned issue_flags) 3189{ 3190 unsigned new_registered_buffers; 3191 struct request *req = io->req; 3192 int ret; 3193 3194 /* 3195 * Ensure there are still references for ublk_sub_req_ref() to release. 3196 * If not, fall back on the thread-safe buffer registration. 3197 */ 3198 new_registered_buffers = io->task_registered_buffers + 1; 3199 if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) 3200 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, 3201 issue_flags); 3202 3203 if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req)) 3204 return -EINVAL; 3205 3206 ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, 3207 issue_flags); 3208 if (ret) 3209 return ret; 3210 3211 io->task_registered_buffers = new_registered_buffers; 3212 return 0; 3213} 3214 3215static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, 3216 const struct ublk_device *ub, 3217 unsigned int index, unsigned int issue_flags) 3218{ 3219 if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)) 3220 return -EINVAL; 3221 3222 return io_buffer_unregister_bvec(cmd, index, issue_flags); 3223} 3224 3225static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) 3226{ 3227 if (ublk_dev_need_map_io(ub)) { 3228 /* 3229 * FETCH_RQ has to provide IO buffer if NEED GET 3230 * DATA is not enabled 3231 */ 3232 if (!buf_addr && !ublk_dev_need_get_data(ub)) 3233 return -EINVAL; 3234 } else if (buf_addr) { 3235 /* User copy requires addr to be unset */ 3236 return -EINVAL; 3237 } 3238 return 0; 3239} 3240 3241static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, 3242 struct ublk_io *io, u16 q_id) 3243{ 3244 /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ 3245 if (ublk_dev_ready(ub)) 3246 return -EBUSY; 3247 3248 /* allow each command to be FETCHed at most once */ 3249 if (io->flags & UBLK_IO_FLAG_ACTIVE) 3250 return -EINVAL; 3251 3252 WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); 3253 3254 ublk_fill_io_cmd(io, cmd); 3255 3256 if (ublk_dev_support_batch_io(ub)) 3257 WRITE_ONCE(io->task, NULL); 3258 else 3259 WRITE_ONCE(io->task, get_task_struct(current)); 3260 3261 return 0; 3262} 3263 3264static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, 3265 struct ublk_io *io, __u64 buf_addr, u16 q_id) 3266{ 3267 int ret; 3268 3269 /* 3270 * When handling FETCH command for setting up ublk uring queue, 3271 * ub->mutex is the innermost lock, and we won't block for handling 3272 * FETCH, so it is fine even for IO_URING_F_NONBLOCK. 
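 * ublk_mark_io_ready() below only updates the ready counters/flags and
 * completes ub->completion.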
3273 */ 3274 mutex_lock(&ub->mutex); 3275 ret = __ublk_fetch(cmd, ub, io, q_id); 3276 if (!ret) 3277 ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); 3278 if (!ret) 3279 ublk_mark_io_ready(ub, q_id, io); 3280 mutex_unlock(&ub->mutex); 3281 return ret; 3282} 3283 3284static int ublk_check_commit_and_fetch(const struct ublk_device *ub, 3285 struct ublk_io *io, __u64 buf_addr) 3286{ 3287 struct request *req = io->req; 3288 3289 if (ublk_dev_need_map_io(ub)) { 3290 /* 3291 * COMMIT_AND_FETCH_REQ has to provide IO buffer if 3292 * NEED GET DATA is not enabled or it is Read IO. 3293 */ 3294 if (!buf_addr && (!ublk_dev_need_get_data(ub) || 3295 req_op(req) == REQ_OP_READ)) 3296 return -EINVAL; 3297 } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { 3298 /* 3299 * User copy requires addr to be unset when command is 3300 * not zone append 3301 */ 3302 return -EINVAL; 3303 } 3304 3305 return 0; 3306} 3307 3308static bool ublk_need_complete_req(const struct ublk_device *ub, 3309 struct ublk_io *io) 3310{ 3311 if (ublk_dev_need_req_ref(ub)) 3312 return ublk_sub_req_ref(io); 3313 return true; 3314} 3315 3316static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, 3317 struct request *req) 3318{ 3319 /* 3320 * We have handled UBLK_IO_NEED_GET_DATA command, 3321 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just 3322 * do the copy work. 3323 */ 3324 io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; 3325 /* update iod->addr because ublksrv may have passed a new io buffer */ 3326 ublk_get_iod(ubq, req->tag)->addr = io->buf.addr; 3327 pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n", 3328 __func__, ubq->q_id, req->tag, io->flags, 3329 ublk_get_iod(ubq, req->tag)->addr); 3330 3331 return ublk_start_io(ubq, req, io); 3332} 3333 3334static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, 3335 unsigned int issue_flags) 3336{ 3337 /* May point to userspace-mapped memory */ 3338 const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe, 3339 struct ublksrv_io_cmd); 3340 u16 buf_idx = UBLK_INVALID_BUF_IDX; 3341 struct ublk_device *ub = cmd->file->private_data; 3342 struct ublk_queue *ubq; 3343 struct ublk_io *io = NULL; 3344 u32 cmd_op = cmd->cmd_op; 3345 u16 q_id = READ_ONCE(ub_src->q_id); 3346 u16 tag = READ_ONCE(ub_src->tag); 3347 s32 result = READ_ONCE(ub_src->result); 3348 u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */ 3349 struct request *req; 3350 int ret; 3351 bool compl; 3352 3353 WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); 3354 3355 pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", 3356 __func__, cmd->cmd_op, q_id, tag, result); 3357 3358 ret = ublk_check_cmd_op(cmd_op); 3359 if (ret) 3360 goto out; 3361 3362 /* 3363 * io_buffer_unregister_bvec() doesn't access the ubq or io, 3364 * so no need to validate the q_id, tag, or task 3365 */ 3366 if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) 3367 return ublk_unregister_io_buf(cmd, ub, addr, issue_flags); 3368 3369 ret = -EINVAL; 3370 if (q_id >= ub->dev_info.nr_hw_queues) 3371 goto out; 3372 3373 ubq = ublk_get_queue(ub, q_id); 3374 3375 if (tag >= ub->dev_info.queue_depth) 3376 goto out; 3377 3378 io = &ubq->ios[tag]; 3379 /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ 3380 if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { 3381 ret = ublk_check_fetch_buf(ub, addr); 3382 if (ret) 3383 goto out; 3384 ret = ublk_fetch(cmd, ub, io, addr, q_id); 3385 if (ret) 3386 goto out; 3387 3388 ublk_prep_cancel(cmd, issue_flags, ubq, tag); 
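		/* the fetch uring_cmd stays inflight until an I/O request is dispatched to it */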
3389 return -EIOCBQUEUED; 3390 } 3391 3392 if (READ_ONCE(io->task) != current) { 3393 /* 3394 * ublk_register_io_buf() accesses only the io's refcount, 3395 * so can be handled on any task 3396 */ 3397 if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) 3398 return ublk_register_io_buf(cmd, ub, q_id, tag, io, 3399 addr, issue_flags); 3400 3401 goto out; 3402 } 3403 3404 /* there is pending io cmd, something must be wrong */ 3405 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) { 3406 ret = -EBUSY; 3407 goto out; 3408 } 3409 3410 /* 3411 * ensure that the user issues UBLK_IO_NEED_GET_DATA 3412 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA. 3413 */ 3414 if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) 3415 ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) 3416 goto out; 3417 3418 switch (_IOC_NR(cmd_op)) { 3419 case UBLK_IO_REGISTER_IO_BUF: 3420 return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr, 3421 issue_flags); 3422 case UBLK_IO_COMMIT_AND_FETCH_REQ: 3423 ret = ublk_check_commit_and_fetch(ub, io, addr); 3424 if (ret) 3425 goto out; 3426 io->res = result; 3427 req = ublk_fill_io_cmd(io, cmd); 3428 ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); 3429 if (buf_idx != UBLK_INVALID_BUF_IDX) 3430 io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); 3431 compl = ublk_need_complete_req(ub, io); 3432 3433 if (req_op(req) == REQ_OP_ZONE_APPEND) 3434 req->__sector = addr; 3435 if (compl) 3436 __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); 3437 3438 if (ret) 3439 goto out; 3440 break; 3441 case UBLK_IO_NEED_GET_DATA: 3442 /* 3443 * ublk_get_data() may fail and fallback to requeue, so keep 3444 * uring_cmd active first and prepare for handling new requeued 3445 * request 3446 */ 3447 req = ublk_fill_io_cmd(io, cmd); 3448 ret = ublk_config_io_buf(ub, io, cmd, addr, NULL); 3449 WARN_ON_ONCE(ret); 3450 if (likely(ublk_get_data(ubq, io, req))) { 3451 __ublk_prep_compl_io_cmd(io, req); 3452 return UBLK_IO_RES_OK; 3453 } 3454 break; 3455 default: 3456 goto out; 3457 } 3458 ublk_prep_cancel(cmd, issue_flags, ubq, tag); 3459 return -EIOCBQUEUED; 3460 3461 out: 3462 pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", 3463 __func__, cmd_op, tag, ret, io ? 
io->flags : 0); 3464 return ret; 3465} 3466 3467static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 3468 u16 q_id, u16 tag, struct ublk_io *io) 3469{ 3470 struct request *req; 3471 3472 /* 3473 * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, 3474 * which would overwrite it with io->cmd 3475 */ 3476 req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); 3477 if (!req) 3478 return NULL; 3479 3480 if (!ublk_get_req_ref(io)) 3481 return NULL; 3482 3483 if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) 3484 goto fail_put; 3485 3486 if (!ublk_rq_has_data(req)) 3487 goto fail_put; 3488 3489 return req; 3490fail_put: 3491 ublk_put_req_ref(io, req); 3492 return NULL; 3493} 3494 3495static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw) 3496{ 3497 unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 3498 struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 3499 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); 3500 3501 if (ret != -EIOCBQUEUED) 3502 io_uring_cmd_done(cmd, ret, issue_flags); 3503} 3504 3505static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 3506{ 3507 if (unlikely(issue_flags & IO_URING_F_CANCEL)) { 3508 ublk_uring_cmd_cancel_fn(cmd, issue_flags); 3509 return 0; 3510 } 3511 3512 /* well-implemented server won't run into unlocked */ 3513 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { 3514 io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb); 3515 return -EIOCBQUEUED; 3516 } 3517 3518 return ublk_ch_uring_cmd_local(cmd, issue_flags); 3519} 3520 3521static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, 3522 const struct ublk_elem_header *elem) 3523{ 3524 const void *buf = elem; 3525 3526 if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) 3527 return *(const __u64 *)(buf + sizeof(*elem)); 3528 return 0; 3529} 3530 3531static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc, 3532 const struct ublk_elem_header *elem) 3533{ 3534 const void *buf = elem; 3535 3536 if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) 3537 return *(const __u64 *)(buf + sizeof(*elem) + 3538 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)); 3539 return -1; 3540} 3541 3542static struct ublk_auto_buf_reg 3543ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, 3544 const struct ublk_elem_header *elem) 3545{ 3546 struct ublk_auto_buf_reg reg = { 3547 .index = elem->buf_index, 3548 .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ? 
3549 UBLK_AUTO_BUF_REG_FALLBACK : 0, 3550 }; 3551 3552 return reg; 3553} 3554 3555/* 3556 * 48 can hold any type of buffer element(8, 16 and 24 bytes) because 3557 * it is the least common multiple(LCM) of 8, 16 and 24 3558 */ 3559#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10) 3560struct ublk_batch_io_iter { 3561 void __user *uaddr; 3562 unsigned done, total; 3563 unsigned char elem_bytes; 3564 /* copy to this buffer from user space */ 3565 unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ]; 3566}; 3567 3568static inline int 3569__ublk_walk_cmd_buf(struct ublk_queue *ubq, 3570 struct ublk_batch_io_iter *iter, 3571 const struct ublk_batch_io_data *data, 3572 unsigned bytes, 3573 int (*cb)(struct ublk_queue *q, 3574 const struct ublk_batch_io_data *data, 3575 const struct ublk_elem_header *elem)) 3576{ 3577 unsigned int i; 3578 int ret = 0; 3579 3580 for (i = 0; i < bytes; i += iter->elem_bytes) { 3581 const struct ublk_elem_header *elem = 3582 (const struct ublk_elem_header *)&iter->buf[i]; 3583 3584 if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) { 3585 ret = -EINVAL; 3586 break; 3587 } 3588 3589 ret = cb(ubq, data, elem); 3590 if (unlikely(ret)) 3591 break; 3592 } 3593 3594 iter->done += i; 3595 return ret; 3596} 3597 3598static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter, 3599 const struct ublk_batch_io_data *data, 3600 int (*cb)(struct ublk_queue *q, 3601 const struct ublk_batch_io_data *data, 3602 const struct ublk_elem_header *elem)) 3603{ 3604 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); 3605 int ret = 0; 3606 3607 while (iter->done < iter->total) { 3608 unsigned int len = min(sizeof(iter->buf), iter->total - iter->done); 3609 3610 if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) { 3611 pr_warn("ublk%d: read batch cmd buffer failed\n", 3612 data->ub->dev_info.dev_id); 3613 return -EFAULT; 3614 } 3615 3616 ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb); 3617 if (ret) 3618 return ret; 3619 } 3620 return 0; 3621} 3622 3623static int ublk_batch_unprep_io(struct ublk_queue *ubq, 3624 const struct ublk_batch_io_data *data, 3625 const struct ublk_elem_header *elem) 3626{ 3627 struct ublk_io *io = &ubq->ios[elem->tag]; 3628 3629 /* 3630 * If queue was ready before this decrement, it won't be anymore, 3631 * so we need to decrement the queue ready count and restore the 3632 * canceling flag to prevent new requests from being queued. 
3633 */ 3634 if (ublk_queue_ready(ubq)) { 3635 data->ub->nr_queue_ready--; 3636 spin_lock(&ubq->cancel_lock); 3637 ubq->canceling = true; 3638 spin_unlock(&ubq->cancel_lock); 3639 } 3640 ubq->nr_io_ready--; 3641 3642 ublk_io_lock(io); 3643 io->flags = 0; 3644 ublk_io_unlock(io); 3645 return 0; 3646} 3647 3648static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter, 3649 const struct ublk_batch_io_data *data) 3650{ 3651 int ret; 3652 3653 /* Re-process only what we've already processed, starting from beginning */ 3654 iter->total = iter->done; 3655 iter->done = 0; 3656 3657 ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io); 3658 WARN_ON_ONCE(ret); 3659} 3660 3661static int ublk_batch_prep_io(struct ublk_queue *ubq, 3662 const struct ublk_batch_io_data *data, 3663 const struct ublk_elem_header *elem) 3664{ 3665 struct ublk_io *io = &ubq->ios[elem->tag]; 3666 const struct ublk_batch_io *uc = &data->header; 3667 union ublk_io_buf buf = { 0 }; 3668 int ret; 3669 3670 if (ublk_dev_support_auto_buf_reg(data->ub)) 3671 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); 3672 else if (ublk_dev_need_map_io(data->ub)) { 3673 buf.addr = ublk_batch_buf_addr(uc, elem); 3674 3675 ret = ublk_check_fetch_buf(data->ub, buf.addr); 3676 if (ret) 3677 return ret; 3678 } 3679 3680 ublk_io_lock(io); 3681 ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id); 3682 if (!ret) 3683 io->buf = buf; 3684 ublk_io_unlock(io); 3685 3686 if (!ret) 3687 ublk_mark_io_ready(data->ub, ubq->q_id, io); 3688 3689 return ret; 3690} 3691 3692static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) 3693{ 3694 const struct ublk_batch_io *uc = &data->header; 3695 struct io_uring_cmd *cmd = data->cmd; 3696 struct ublk_batch_io_iter iter = { 3697 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), 3698 .total = uc->nr_elem * uc->elem_bytes, 3699 .elem_bytes = uc->elem_bytes, 3700 }; 3701 int ret; 3702 3703 mutex_lock(&data->ub->mutex); 3704 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io); 3705 3706 if (ret && iter.done) 3707 ublk_batch_revert_prep_cmd(&iter, data); 3708 mutex_unlock(&data->ub->mutex); 3709 return ret; 3710} 3711 3712static int ublk_batch_commit_io_check(const struct ublk_queue *ubq, 3713 struct ublk_io *io, 3714 union ublk_io_buf *buf) 3715{ 3716 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) 3717 return -EBUSY; 3718 3719 /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */ 3720 if (ublk_need_map_io(ubq) && !buf->addr) 3721 return -EINVAL; 3722 return 0; 3723} 3724 3725static int ublk_batch_commit_io(struct ublk_queue *ubq, 3726 const struct ublk_batch_io_data *data, 3727 const struct ublk_elem_header *elem) 3728{ 3729 struct ublk_io *io = &ubq->ios[elem->tag]; 3730 const struct ublk_batch_io *uc = &data->header; 3731 u16 buf_idx = UBLK_INVALID_BUF_IDX; 3732 union ublk_io_buf buf = { 0 }; 3733 struct request *req = NULL; 3734 bool auto_reg = false; 3735 bool compl = false; 3736 int ret; 3737 3738 if (ublk_dev_support_auto_buf_reg(data->ub)) { 3739 buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); 3740 auto_reg = true; 3741 } else if (ublk_dev_need_map_io(data->ub)) 3742 buf.addr = ublk_batch_buf_addr(uc, elem); 3743 3744 ublk_io_lock(io); 3745 ret = ublk_batch_commit_io_check(ubq, io, &buf); 3746 if (!ret) { 3747 io->res = elem->result; 3748 io->buf = buf; 3749 req = ublk_fill_io_cmd(io, data->cmd); 3750 3751 if (auto_reg) 3752 ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx); 3753 compl = ublk_need_complete_req(data->ub, io); 3754 } 3755 ublk_io_unlock(io); 3756 3757 if 
(unlikely(ret)) { 3758 pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n", 3759 __func__, data->ub->dev_info.dev_id, ubq->q_id, 3760 elem->tag, ret); 3761 return ret; 3762 } 3763 3764 if (buf_idx != UBLK_INVALID_BUF_IDX) 3765 io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); 3766 if (req_op(req) == REQ_OP_ZONE_APPEND) 3767 req->__sector = ublk_batch_zone_lba(uc, elem); 3768 if (compl) 3769 __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob); 3770 return 0; 3771} 3772 3773static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data) 3774{ 3775 const struct ublk_batch_io *uc = &data->header; 3776 struct io_uring_cmd *cmd = data->cmd; 3777 struct ublk_batch_io_iter iter = { 3778 .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), 3779 .total = uc->nr_elem * uc->elem_bytes, 3780 .elem_bytes = uc->elem_bytes, 3781 }; 3782 DEFINE_IO_COMP_BATCH(iob); 3783 int ret; 3784 3785 data->iob = &iob; 3786 ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io); 3787 3788 if (iob.complete) 3789 iob.complete(&iob); 3790 3791 return iter.done == 0 ? ret : iter.done; 3792} 3793 3794static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) 3795{ 3796 unsigned elem_bytes = sizeof(struct ublk_elem_header); 3797 3798 if (uc->flags & ~UBLK_BATCH_F_ALL) 3799 return -EINVAL; 3800 3801 /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */ 3802 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && 3803 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)) 3804 return -EINVAL; 3805 3806 elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) + 3807 (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? sizeof(u64) : 0); 3808 if (uc->elem_bytes != elem_bytes) 3809 return -EINVAL; 3810 return 0; 3811} 3812 3813static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) 3814{ 3815 const struct ublk_batch_io *uc = &data->header; 3816 3817 if (uc->q_id >= data->ub->dev_info.nr_hw_queues) 3818 return -EINVAL; 3819 3820 if (uc->nr_elem > data->ub->dev_info.queue_depth) 3821 return -E2BIG; 3822 3823 if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) && 3824 !ublk_dev_is_zoned(data->ub)) 3825 return -EINVAL; 3826 3827 if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) && 3828 !ublk_dev_need_map_io(data->ub)) 3829 return -EINVAL; 3830 3831 if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && 3832 !ublk_dev_support_auto_buf_reg(data->ub)) 3833 return -EINVAL; 3834 3835 return ublk_check_batch_cmd_flags(uc); 3836} 3837 3838static int ublk_batch_attach(struct ublk_queue *ubq, 3839 struct ublk_batch_io_data *data, 3840 struct ublk_batch_fetch_cmd *fcmd) 3841{ 3842 struct ublk_batch_fetch_cmd *new_fcmd = NULL; 3843 bool free = false; 3844 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(data->cmd); 3845 3846 spin_lock(&ubq->evts_lock); 3847 if (unlikely(ubq->force_abort || ubq->canceling)) { 3848 free = true; 3849 } else { 3850 list_add_tail(&fcmd->node, &ubq->fcmd_head); 3851 new_fcmd = __ublk_acquire_fcmd(ubq); 3852 } 3853 spin_unlock(&ubq->evts_lock); 3854 3855 if (unlikely(free)) { 3856 ublk_batch_free_fcmd(fcmd); 3857 return -ENODEV; 3858 } 3859 3860 pdu->ubq = ubq; 3861 pdu->fcmd = fcmd; 3862 io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags); 3863 3864 if (!new_fcmd) 3865 goto out; 3866 3867 /* 3868 * If the two fetch commands are originated from same io_ring_ctx, 3869 * run batch dispatch directly. Otherwise, schedule task work for 3870 * doing it. 
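 * (sharing the ring means the current issue context and its locking also
 * apply to the new command, so running it inline is safe)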
3871 */ 3872 if (io_uring_cmd_ctx_handle(new_fcmd->cmd) == 3873 io_uring_cmd_ctx_handle(fcmd->cmd)) { 3874 data->cmd = new_fcmd->cmd; 3875 ublk_batch_dispatch(ubq, data, new_fcmd); 3876 } else { 3877 io_uring_cmd_complete_in_task(new_fcmd->cmd, 3878 ublk_batch_tw_cb); 3879 } 3880out: 3881 return -EIOCBQUEUED; 3882} 3883 3884static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data) 3885{ 3886 struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); 3887 struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd); 3888 3889 if (!fcmd) 3890 return -ENOMEM; 3891 3892 return ublk_batch_attach(ubq, data, fcmd); 3893} 3894 3895static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) 3896{ 3897 const struct ublk_batch_io *uc = &data->header; 3898 3899 if (uc->q_id >= data->ub->dev_info.nr_hw_queues) 3900 return -EINVAL; 3901 3902 if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) 3903 return -EINVAL; 3904 3905 if (uc->elem_bytes != sizeof(__u16)) 3906 return -EINVAL; 3907 3908 if (uc->flags != 0) 3909 return -EINVAL; 3910 3911 return 0; 3912} 3913 3914static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, 3915 unsigned int issue_flags) 3916{ 3917 const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe, 3918 struct ublksrv_io_cmd); 3919 struct ublk_device *ub = cmd->file->private_data; 3920 unsigned tag = READ_ONCE(ub_cmd->tag); 3921 unsigned q_id = READ_ONCE(ub_cmd->q_id); 3922 unsigned index = READ_ONCE(ub_cmd->addr); 3923 struct ublk_queue *ubq; 3924 struct ublk_io *io; 3925 3926 if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF) 3927 return ublk_unregister_io_buf(cmd, ub, index, issue_flags); 3928 3929 if (q_id >= ub->dev_info.nr_hw_queues) 3930 return -EINVAL; 3931 3932 if (tag >= ub->dev_info.queue_depth) 3933 return -EINVAL; 3934 3935 if (cmd->cmd_op != UBLK_U_IO_REGISTER_IO_BUF) 3936 return -EOPNOTSUPP; 3937 3938 ubq = ublk_get_queue(ub, q_id); 3939 io = &ubq->ios[tag]; 3940 return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, 3941 issue_flags); 3942} 3943 3944static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, 3945 unsigned int issue_flags) 3946{ 3947 const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe, 3948 struct ublk_batch_io); 3949 struct ublk_device *ub = cmd->file->private_data; 3950 struct ublk_batch_io_data data = { 3951 .ub = ub, 3952 .cmd = cmd, 3953 .header = (struct ublk_batch_io) { 3954 .q_id = READ_ONCE(uc->q_id), 3955 .flags = READ_ONCE(uc->flags), 3956 .nr_elem = READ_ONCE(uc->nr_elem), 3957 .elem_bytes = READ_ONCE(uc->elem_bytes), 3958 }, 3959 .issue_flags = issue_flags, 3960 }; 3961 u32 cmd_op = cmd->cmd_op; 3962 int ret = -EINVAL; 3963 3964 if (unlikely(issue_flags & IO_URING_F_CANCEL)) { 3965 ublk_batch_cancel_fn(cmd, issue_flags); 3966 return 0; 3967 } 3968 3969 switch (cmd_op) { 3970 case UBLK_U_IO_PREP_IO_CMDS: 3971 ret = ublk_check_batch_cmd(&data); 3972 if (ret) 3973 goto out; 3974 ret = ublk_handle_batch_prep_cmd(&data); 3975 break; 3976 case UBLK_U_IO_COMMIT_IO_CMDS: 3977 ret = ublk_check_batch_cmd(&data); 3978 if (ret) 3979 goto out; 3980 ret = ublk_handle_batch_commit_cmd(&data); 3981 break; 3982 case UBLK_U_IO_FETCH_IO_CMDS: 3983 ret = ublk_validate_batch_fetch_cmd(&data); 3984 if (ret) 3985 goto out; 3986 ret = ublk_handle_batch_fetch_cmd(&data); 3987 break; 3988 default: 3989 ret = ublk_handle_non_batch_cmd(cmd, issue_flags); 3990 break; 3991 } 3992out: 3993 return ret; 3994} 3995 3996static inline bool ublk_check_ubuf_dir(const struct request *req, 3997 int 
ubuf_dir) 3998{ 3999 /* copy ubuf to request pages */ 4000 if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) && 4001 ubuf_dir == ITER_SOURCE) 4002 return true; 4003 4004 /* copy request pages to ubuf */ 4005 if ((req_op(req) == REQ_OP_WRITE || 4006 req_op(req) == REQ_OP_ZONE_APPEND) && 4007 ubuf_dir == ITER_DEST) 4008 return true; 4009 4010 return false; 4011} 4012 4013static ssize_t 4014ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) 4015{ 4016 struct ublk_device *ub = iocb->ki_filp->private_data; 4017 struct ublk_queue *ubq; 4018 struct request *req; 4019 struct ublk_io *io; 4020 unsigned data_len; 4021 bool is_integrity; 4022 bool on_daemon; 4023 size_t buf_off; 4024 u16 tag, q_id; 4025 ssize_t ret; 4026 4027 if (!user_backed_iter(iter)) 4028 return -EACCES; 4029 4030 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 4031 return -EACCES; 4032 4033 tag = ublk_pos_to_tag(iocb->ki_pos); 4034 q_id = ublk_pos_to_hwq(iocb->ki_pos); 4035 buf_off = ublk_pos_to_buf_off(iocb->ki_pos); 4036 is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG); 4037 4038 if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity)) 4039 return -EINVAL; 4040 4041 if (q_id >= ub->dev_info.nr_hw_queues) 4042 return -EINVAL; 4043 4044 ubq = ublk_get_queue(ub, q_id); 4045 if (!ublk_dev_support_user_copy(ub)) 4046 return -EACCES; 4047 4048 if (tag >= ub->dev_info.queue_depth) 4049 return -EINVAL; 4050 4051 io = &ubq->ios[tag]; 4052 on_daemon = current == READ_ONCE(io->task); 4053 if (on_daemon) { 4054 /* On daemon, io can't be completed concurrently, so skip ref */ 4055 if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) 4056 return -EINVAL; 4057 4058 req = io->req; 4059 if (!ublk_rq_has_data(req)) 4060 return -EINVAL; 4061 } else { 4062 req = __ublk_check_and_get_req(ub, q_id, tag, io); 4063 if (!req) 4064 return -EINVAL; 4065 } 4066 4067 if (is_integrity) { 4068 struct blk_integrity *bi = &req->q->limits.integrity; 4069 4070 data_len = bio_integrity_bytes(bi, blk_rq_sectors(req)); 4071 } else { 4072 data_len = blk_rq_bytes(req); 4073 } 4074 if (buf_off > data_len) { 4075 ret = -EINVAL; 4076 goto out; 4077 } 4078 4079 if (!ublk_check_ubuf_dir(req, dir)) { 4080 ret = -EACCES; 4081 goto out; 4082 } 4083 4084 if (is_integrity) 4085 ret = ublk_copy_user_integrity(req, buf_off, iter, dir); 4086 else 4087 ret = ublk_copy_user_pages(req, buf_off, iter, dir); 4088 4089out: 4090 if (!on_daemon) 4091 ublk_put_req_ref(io, req); 4092 return ret; 4093} 4094 4095static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) 4096{ 4097 return ublk_user_copy(iocb, to, ITER_DEST); 4098} 4099 4100static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) 4101{ 4102 return ublk_user_copy(iocb, from, ITER_SOURCE); 4103} 4104 4105static const struct file_operations ublk_ch_fops = { 4106 .owner = THIS_MODULE, 4107 .open = ublk_ch_open, 4108 .release = ublk_ch_release, 4109 .read_iter = ublk_ch_read_iter, 4110 .write_iter = ublk_ch_write_iter, 4111 .uring_cmd = ublk_ch_uring_cmd, 4112 .mmap = ublk_ch_mmap, 4113}; 4114 4115static const struct file_operations ublk_ch_batch_io_fops = { 4116 .owner = THIS_MODULE, 4117 .open = ublk_ch_open, 4118 .release = ublk_ch_release, 4119 .read_iter = ublk_ch_read_iter, 4120 .write_iter = ublk_ch_write_iter, 4121 .uring_cmd = ublk_ch_batch_io_uring_cmd, 4122 .mmap = ublk_ch_mmap, 4123}; 4124 4125static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq) 4126{ 4127 int size, i; 4128 4129 size = ublk_queue_cmd_buf_size(ub); 4130 
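	/*
	 * Per-io teardown: drop the io daemon task reference and warn if
	 * any request or registered-buffer references have leaked.
	 */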
4131 for (i = 0; i < ubq->q_depth; i++) { 4132 struct ublk_io *io = &ubq->ios[i]; 4133 if (io->task) 4134 put_task_struct(io->task); 4135 WARN_ON_ONCE(refcount_read(&io->ref)); 4136 WARN_ON_ONCE(io->task_registered_buffers); 4137 } 4138 4139 if (ubq->io_cmd_buf) 4140 free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); 4141 4142 if (ublk_dev_support_batch_io(ub)) 4143 ublk_io_evts_deinit(ubq); 4144 4145 kvfree(ubq); 4146} 4147 4148static void ublk_deinit_queue(struct ublk_device *ub, int q_id) 4149{ 4150 struct ublk_queue *ubq = ub->queues[q_id]; 4151 4152 if (!ubq) 4153 return; 4154 4155 __ublk_deinit_queue(ub, ubq); 4156 ub->queues[q_id] = NULL; 4157} 4158 4159static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id) 4160{ 4161 unsigned int cpu; 4162 4163 /* Find first CPU mapped to this queue */ 4164 for_each_possible_cpu(cpu) { 4165 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id) 4166 return cpu_to_node(cpu); 4167 } 4168 4169 return NUMA_NO_NODE; 4170} 4171 4172static int ublk_init_queue(struct ublk_device *ub, int q_id) 4173{ 4174 int depth = ub->dev_info.queue_depth; 4175 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; 4176 struct ublk_queue *ubq; 4177 struct page *page; 4178 int numa_node; 4179 int size, i, ret; 4180 4181 /* Determine NUMA node based on queue's CPU affinity */ 4182 numa_node = ublk_get_queue_numa_node(ub, q_id); 4183 4184 /* Allocate queue structure on local NUMA node */ 4185 ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL, 4186 numa_node); 4187 if (!ubq) 4188 return -ENOMEM; 4189 4190 spin_lock_init(&ubq->cancel_lock); 4191 ubq->flags = ub->dev_info.flags; 4192 ubq->q_id = q_id; 4193 ubq->q_depth = depth; 4194 size = ublk_queue_cmd_buf_size(ub); 4195 4196 /* Allocate I/O command buffer on local NUMA node */ 4197 page = alloc_pages_node(numa_node, gfp_flags, get_order(size)); 4198 if (!page) { 4199 kvfree(ubq); 4200 return -ENOMEM; 4201 } 4202 ubq->io_cmd_buf = page_address(page); 4203 4204 for (i = 0; i < ubq->q_depth; i++) 4205 spin_lock_init(&ubq->ios[i].lock); 4206 4207 if (ublk_dev_support_batch_io(ub)) { 4208 ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node); 4209 if (ret) 4210 goto fail; 4211 INIT_LIST_HEAD(&ubq->fcmd_head); 4212 } 4213 ub->queues[q_id] = ubq; 4214 ubq->dev = ub; 4215 4216 return 0; 4217fail: 4218 __ublk_deinit_queue(ub, ubq); 4219 return ret; 4220} 4221 4222static void ublk_deinit_queues(struct ublk_device *ub) 4223{ 4224 int i; 4225 4226 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) 4227 ublk_deinit_queue(ub, i); 4228} 4229 4230static int ublk_init_queues(struct ublk_device *ub) 4231{ 4232 int i, ret; 4233 4234 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 4235 ret = ublk_init_queue(ub, i); 4236 if (ret) 4237 goto fail; 4238 } 4239 4240 init_completion(&ub->completion); 4241 return 0; 4242 4243 fail: 4244 ublk_deinit_queues(ub); 4245 return ret; 4246} 4247 4248static int ublk_alloc_dev_number(struct ublk_device *ub, int idx) 4249{ 4250 int i = idx; 4251 int err; 4252 4253 spin_lock(&ublk_idr_lock); 4254 /* allocate id, if @id >= 0, we're requesting that specific id */ 4255 if (i >= 0) { 4256 err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT); 4257 if (err == -ENOSPC) 4258 err = -EEXIST; 4259 } else { 4260 err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS, 4261 GFP_NOWAIT); 4262 } 4263 spin_unlock(&ublk_idr_lock); 4264 4265 if (err >= 0) 4266 ub->ub_number = err; 4267 4268 return err; 4269} 4270 4271static void ublk_free_dev_number(struct ublk_device *ub) 4272{ 4273 
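	/*
	 * Dropping the idr entry also wakes ublk_idr_wq, which
	 * ublk_ctrl_del_dev() waits on (via ublk_idr_freed()) before the
	 * device number may be reused.
	 */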
spin_lock(&ublk_idr_lock); 4274 idr_remove(&ublk_index_idr, ub->ub_number); 4275 wake_up_all(&ublk_idr_wq); 4276 spin_unlock(&ublk_idr_lock); 4277} 4278 4279static void ublk_cdev_rel(struct device *dev) 4280{ 4281 struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev); 4282 4283 ublk_buf_cleanup(ub); 4284 blk_mq_free_tag_set(&ub->tag_set); 4285 ublk_deinit_queues(ub); 4286 ublk_free_dev_number(ub); 4287 mutex_destroy(&ub->mutex); 4288 mutex_destroy(&ub->cancel_mutex); 4289 kfree(ub); 4290} 4291 4292static int ublk_add_chdev(struct ublk_device *ub) 4293{ 4294 struct device *dev = &ub->cdev_dev; 4295 int minor = ub->ub_number; 4296 int ret; 4297 4298 dev->parent = ublk_misc.this_device; 4299 dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); 4300 dev->class = &ublk_chr_class; 4301 dev->release = ublk_cdev_rel; 4302 device_initialize(dev); 4303 4304 ret = dev_set_name(dev, "ublkc%d", minor); 4305 if (ret) 4306 goto fail; 4307 4308 if (ublk_dev_support_batch_io(ub)) 4309 cdev_init(&ub->cdev, &ublk_ch_batch_io_fops); 4310 else 4311 cdev_init(&ub->cdev, &ublk_ch_fops); 4312 ret = cdev_device_add(&ub->cdev, dev); 4313 if (ret) 4314 goto fail; 4315 4316 if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) 4317 unprivileged_ublks_added++; 4318 return 0; 4319 fail: 4320 put_device(dev); 4321 return ret; 4322} 4323 4324/* align max io buffer size with PAGE_SIZE */ 4325static void ublk_align_max_io_size(struct ublk_device *ub) 4326{ 4327 unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes; 4328 4329 ub->dev_info.max_io_buf_bytes = 4330 round_down(max_io_bytes, PAGE_SIZE); 4331} 4332 4333static int ublk_add_tag_set(struct ublk_device *ub) 4334{ 4335 if (ublk_dev_support_batch_io(ub)) 4336 ub->tag_set.ops = &ublk_batch_mq_ops; 4337 else 4338 ub->tag_set.ops = &ublk_mq_ops; 4339 ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; 4340 ub->tag_set.queue_depth = ub->dev_info.queue_depth; 4341 ub->tag_set.numa_node = NUMA_NO_NODE; 4342 ub->tag_set.driver_data = ub; 4343 return blk_mq_alloc_tag_set(&ub->tag_set); 4344} 4345 4346static void ublk_remove(struct ublk_device *ub) 4347{ 4348 bool unprivileged; 4349 4350 ublk_stop_dev(ub); 4351 cdev_device_del(&ub->cdev, &ub->cdev_dev); 4352 unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; 4353 ublk_put_device(ub); 4354 4355 if (unprivileged) 4356 unprivileged_ublks_added--; 4357} 4358 4359static struct ublk_device *ublk_get_device_from_id(int idx) 4360{ 4361 struct ublk_device *ub = NULL; 4362 4363 if (idx < 0) 4364 return NULL; 4365 4366 spin_lock(&ublk_idr_lock); 4367 ub = idr_find(&ublk_index_idr, idx); 4368 if (ub) 4369 ub = ublk_get_device(ub); 4370 spin_unlock(&ublk_idr_lock); 4371 4372 return ub; 4373} 4374 4375static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid) 4376{ 4377 rcu_read_lock(); 4378 ublksrv_pid = pid_nr(find_vpid(ublksrv_pid)); 4379 rcu_read_unlock(); 4380 4381 return ub->ublksrv_tgid == ublksrv_pid; 4382} 4383 4384static int ublk_ctrl_start_dev(struct ublk_device *ub, 4385 const struct ublksrv_ctrl_cmd *header) 4386{ 4387 const struct ublk_param_basic *p = &ub->params.basic; 4388 int ublksrv_pid = (int)header->data[0]; 4389 struct queue_limits lim = { 4390 .logical_block_size = 1 << p->logical_bs_shift, 4391 .physical_block_size = 1 << p->physical_bs_shift, 4392 .io_min = 1 << p->io_min_shift, 4393 .io_opt = 1 << p->io_opt_shift, 4394 .max_hw_sectors = p->max_sectors, 4395 .chunk_sectors = p->chunk_sectors, 4396 .virt_boundary_mask = p->virt_boundary_mask, 4397 .max_segments = USHRT_MAX, 
4398 .max_segment_size = UINT_MAX, 4399 .dma_alignment = 3, 4400 }; 4401 struct gendisk *disk; 4402 int ret = -EINVAL; 4403 4404 if (ublksrv_pid <= 0) 4405 return -EINVAL; 4406 if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) 4407 return -EINVAL; 4408 4409 if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { 4410 const struct ublk_param_discard *pd = &ub->params.discard; 4411 4412 lim.discard_alignment = pd->discard_alignment; 4413 lim.discard_granularity = pd->discard_granularity; 4414 lim.max_hw_discard_sectors = pd->max_discard_sectors; 4415 lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; 4416 lim.max_discard_segments = pd->max_discard_segments; 4417 } 4418 4419 if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { 4420 const struct ublk_param_zoned *p = &ub->params.zoned; 4421 4422 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) 4423 return -EOPNOTSUPP; 4424 4425 lim.features |= BLK_FEAT_ZONED; 4426 lim.max_active_zones = p->max_active_zones; 4427 lim.max_open_zones = p->max_open_zones; 4428 lim.max_hw_zone_append_sectors = p->max_zone_append_sectors; 4429 } 4430 4431 if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) { 4432 lim.features |= BLK_FEAT_WRITE_CACHE; 4433 if (ub->params.basic.attrs & UBLK_ATTR_FUA) 4434 lim.features |= BLK_FEAT_FUA; 4435 } 4436 4437 if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) 4438 lim.features |= BLK_FEAT_ROTATIONAL; 4439 4440 if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) 4441 lim.dma_alignment = ub->params.dma.alignment; 4442 4443 if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { 4444 lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask; 4445 lim.max_segment_size = ub->params.seg.max_segment_size; 4446 lim.max_segments = ub->params.seg.max_segments; 4447 } 4448 4449 if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { 4450 const struct ublk_param_integrity *p = &ub->params.integrity; 4451 int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); 4452 4453 lim.max_integrity_segments = 4454 p->max_integrity_segments ?: USHRT_MAX; 4455 lim.integrity = (struct blk_integrity) { 4456 .flags = ublk_integrity_flags(p->flags), 4457 .csum_type = ublk_integrity_csum_type(p->csum_type), 4458 .metadata_size = p->metadata_size, 4459 .pi_offset = p->pi_offset, 4460 .interval_exp = p->interval_exp, 4461 .tag_size = p->tag_size, 4462 .pi_tuple_size = pi_tuple_size, 4463 }; 4464 } 4465 4466 if (wait_for_completion_interruptible(&ub->completion) != 0) 4467 return -EINTR; 4468 4469 if (!ublk_validate_user_pid(ub, ublksrv_pid)) 4470 return -EINVAL; 4471 4472 mutex_lock(&ub->mutex); 4473 /* device may become not ready in case of F_BATCH */ 4474 if (!ublk_dev_ready(ub)) { 4475 ret = -EINVAL; 4476 goto out_unlock; 4477 } 4478 if (ub->dev_info.state == UBLK_S_DEV_LIVE || 4479 test_bit(UB_STATE_USED, &ub->state)) { 4480 ret = -EEXIST; 4481 goto out_unlock; 4482 } 4483 4484 disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); 4485 if (IS_ERR(disk)) { 4486 ret = PTR_ERR(disk); 4487 goto out_unlock; 4488 } 4489 sprintf(disk->disk_name, "ublkb%d", ub->ub_number); 4490 disk->fops = &ub_fops; 4491 disk->private_data = ub; 4492 4493 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid; 4494 ub->ub_disk = disk; 4495 4496 ublk_apply_params(ub); 4497 4498 /* 4499 * Suppress partition scan to avoid potential IO hang. 4500 * 4501 * If ublk server error occurs during partition scan, the IO may 4502 * wait while holding ub->mutex, which can deadlock with other 4503 * operations that need the mutex. Defer partition scan to async 4504 * work. 
4505 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set 4506 * permanently. 4507 */ 4508 set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 4509 4510 ublk_get_device(ub); 4511 ub->dev_info.state = UBLK_S_DEV_LIVE; 4512 4513 if (ublk_dev_is_zoned(ub)) { 4514 ret = ublk_revalidate_disk_zones(ub); 4515 if (ret) 4516 goto out_put_cdev; 4517 } 4518 4519 ret = add_disk(disk); 4520 if (ret) 4521 goto out_put_cdev; 4522 4523 set_bit(UB_STATE_USED, &ub->state); 4524 4525 /* Skip partition scan if disabled by user */ 4526 if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) { 4527 /* Not clear for unprivileged daemons, see comment above */ 4528 if (!ub->unprivileged_daemons) 4529 clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 4530 } else { 4531 /* Schedule async partition scan for trusted daemons */ 4532 if (!ub->unprivileged_daemons) 4533 schedule_work(&ub->partition_scan_work); 4534 } 4535 4536out_put_cdev: 4537 if (ret) { 4538 ublk_detach_disk(ub); 4539 ublk_put_device(ub); 4540 } 4541 if (ret) 4542 put_disk(disk); 4543out_unlock: 4544 mutex_unlock(&ub->mutex); 4545 return ret; 4546} 4547 4548static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, 4549 const struct ublksrv_ctrl_cmd *header) 4550{ 4551 void __user *argp = (void __user *)(unsigned long)header->addr; 4552 cpumask_var_t cpumask; 4553 unsigned long queue; 4554 unsigned int retlen; 4555 unsigned int i; 4556 int ret; 4557 4558 if (header->len * BITS_PER_BYTE < nr_cpu_ids) 4559 return -EINVAL; 4560 if (header->len & (sizeof(unsigned long)-1)) 4561 return -EINVAL; 4562 if (!header->addr) 4563 return -EINVAL; 4564 4565 queue = header->data[0]; 4566 if (queue >= ub->dev_info.nr_hw_queues) 4567 return -EINVAL; 4568 4569 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) 4570 return -ENOMEM; 4571 4572 for_each_possible_cpu(i) { 4573 if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) 4574 cpumask_set_cpu(i, cpumask); 4575 } 4576 4577 ret = -EFAULT; 4578 retlen = min_t(unsigned short, header->len, cpumask_size()); 4579 if (copy_to_user(argp, cpumask, retlen)) 4580 goto out_free_cpumask; 4581 if (retlen != header->len && 4582 clear_user(argp + retlen, header->len - retlen)) 4583 goto out_free_cpumask; 4584 4585 ret = 0; 4586out_free_cpumask: 4587 free_cpumask_var(cpumask); 4588 return ret; 4589} 4590 4591static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) 4592{ 4593 pr_devel("%s: dev id %d flags %llx\n", __func__, 4594 info->dev_id, info->flags); 4595 pr_devel("\t nr_hw_queues %d queue_depth %d\n", 4596 info->nr_hw_queues, info->queue_depth); 4597} 4598 4599static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) 4600{ 4601 void __user *argp = (void __user *)(unsigned long)header->addr; 4602 struct ublksrv_ctrl_dev_info info; 4603 struct ublk_device *ub; 4604 int ret = -EINVAL; 4605 4606 if (header->len < sizeof(info) || !header->addr) 4607 return -EINVAL; 4608 if (header->queue_id != (u16)-1) { 4609 pr_warn("%s: queue_id is wrong %x\n", 4610 __func__, header->queue_id); 4611 return -EINVAL; 4612 } 4613 4614 if (copy_from_user(&info, argp, sizeof(info))) 4615 return -EFAULT; 4616 4617 if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth || 4618 info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues) 4619 return -EINVAL; 4620 4621 if (capable(CAP_SYS_ADMIN)) 4622 info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; 4623 else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) 4624 return -EPERM; 4625 4626 /* forbid nonsense combinations of recovery flags */ 4627 switch (info.flags & 
UBLK_F_ALL_RECOVERY_FLAGS) { 4628 case 0: 4629 case UBLK_F_USER_RECOVERY: 4630 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE): 4631 case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO): 4632 break; 4633 default: 4634 pr_warn("%s: invalid recovery flags %llx\n", __func__, 4635 info.flags & UBLK_F_ALL_RECOVERY_FLAGS); 4636 return -EINVAL; 4637 } 4638 4639 if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) { 4640 pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n"); 4641 return -EINVAL; 4642 } 4643 4644 /* 4645 * unprivileged device can't be trusted, but RECOVERY and 4646 * RECOVERY_REISSUE still may hang error handling, so can't 4647 * support recovery features for unprivileged ublk now 4648 * 4649 * TODO: provide forward progress for RECOVERY handler, so that 4650 * unprivileged device can benefit from it 4651 */ 4652 if (info.flags & UBLK_F_UNPRIVILEGED_DEV) { 4653 info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | 4654 UBLK_F_USER_RECOVERY); 4655 4656 /* 4657 * For USER_COPY, we depends on userspace to fill request 4658 * buffer by pwrite() to ublk char device, which can't be 4659 * used for unprivileged device 4660 * 4661 * Same with zero copy or auto buffer register. 4662 */ 4663 if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | 4664 UBLK_F_AUTO_BUF_REG)) 4665 return -EINVAL; 4666 } 4667 4668 /* User copy is required to access integrity buffer */ 4669 if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY)) 4670 return -EINVAL; 4671 4672 /* the created device is always owned by current user */ 4673 ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); 4674 4675 if (header->dev_id != info.dev_id) { 4676 pr_warn("%s: dev id not match %u %u\n", 4677 __func__, header->dev_id, info.dev_id); 4678 return -EINVAL; 4679 } 4680 4681 if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) { 4682 pr_warn("%s: dev id is too large. Max supported is %d\n", 4683 __func__, UBLK_MAX_UBLKS - 1); 4684 return -EINVAL; 4685 } 4686 4687 ublk_dump_dev_info(&info); 4688 4689 ret = mutex_lock_killable(&ublk_ctl_mutex); 4690 if (ret) 4691 return ret; 4692 4693 ret = -EACCES; 4694 if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) && 4695 unprivileged_ublks_added >= unprivileged_ublks_max) 4696 goto out_unlock; 4697 4698 ret = -ENOMEM; 4699 ub = kzalloc_flex(*ub, queues, info.nr_hw_queues); 4700 if (!ub) 4701 goto out_unlock; 4702 mutex_init(&ub->mutex); 4703 spin_lock_init(&ub->lock); 4704 mutex_init(&ub->cancel_mutex); 4705 mt_init(&ub->buf_tree); 4706 ida_init(&ub->buf_ida); 4707 INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work); 4708 4709 ret = ublk_alloc_dev_number(ub, header->dev_id); 4710 if (ret < 0) 4711 goto out_free_ub; 4712 4713 memcpy(&ub->dev_info, &info, sizeof(info)); 4714 4715 /* update device id */ 4716 ub->dev_info.dev_id = ub->ub_number; 4717 4718 /* 4719 * 64bit flags will be copied back to userspace as feature 4720 * negotiation result, so have to clear flags which driver 4721 * doesn't support yet, then userspace can get correct flags 4722 * (features) to handle. 
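 *
 * Worked example (server view): if userspace requests
 *
 *	info.flags = UBLK_F_USER_COPY | UBLK_F_NEED_GET_DATA;
 *
 * the flags copied back after ADD_DEV have UBLK_F_NEED_GET_DATA cleared,
 * since GET_DATA is redundant once USER_COPY (or zero copy / auto buffer
 * register) is negotiated - see the adjustments below.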
4723 */ 4724 ub->dev_info.flags &= UBLK_F_ALL; 4725 4726 ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | 4727 UBLK_F_URING_CMD_COMP_IN_TASK | 4728 UBLK_F_PER_IO_DAEMON | 4729 UBLK_F_BUF_REG_OFF_DAEMON | 4730 UBLK_F_SAFE_STOP_DEV; 4731 4732 /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */ 4733 if (ublk_dev_support_batch_io(ub)) 4734 ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON; 4735 4736 /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ 4737 if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | 4738 UBLK_F_AUTO_BUF_REG)) 4739 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; 4740 4741 /* UBLK_F_BATCH_IO doesn't support GET_DATA */ 4742 if (ublk_dev_support_batch_io(ub)) 4743 ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; 4744 4745 /* 4746 * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for 4747 * returning write_append_lba, which is only allowed in case of 4748 * user copy or zero copy 4749 */ 4750 if (ublk_dev_is_zoned(ub) && 4751 (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags & 4752 (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) { 4753 ret = -EINVAL; 4754 goto out_free_dev_number; 4755 } 4756 4757 ub->dev_info.nr_hw_queues = min_t(unsigned int, 4758 ub->dev_info.nr_hw_queues, nr_cpu_ids); 4759 ublk_align_max_io_size(ub); 4760 4761 ret = ublk_add_tag_set(ub); 4762 if (ret) 4763 goto out_free_dev_number; 4764 4765 ret = ublk_init_queues(ub); 4766 if (ret) 4767 goto out_free_tag_set; 4768 4769 ret = -EFAULT; 4770 if (copy_to_user(argp, &ub->dev_info, sizeof(info))) 4771 goto out_deinit_queues; 4772 4773 /* 4774 * Add the char dev so that ublksrv daemon can be setup. 4775 * ublk_add_chdev() will cleanup everything if it fails. 4776 */ 4777 ret = ublk_add_chdev(ub); 4778 goto out_unlock; 4779 4780out_deinit_queues: 4781 ublk_deinit_queues(ub); 4782out_free_tag_set: 4783 blk_mq_free_tag_set(&ub->tag_set); 4784out_free_dev_number: 4785 ublk_free_dev_number(ub); 4786out_free_ub: 4787 mutex_destroy(&ub->mutex); 4788 mutex_destroy(&ub->cancel_mutex); 4789 kfree(ub); 4790out_unlock: 4791 mutex_unlock(&ublk_ctl_mutex); 4792 return ret; 4793} 4794 4795static inline bool ublk_idr_freed(int id) 4796{ 4797 void *ptr; 4798 4799 spin_lock(&ublk_idr_lock); 4800 ptr = idr_find(&ublk_index_idr, id); 4801 spin_unlock(&ublk_idr_lock); 4802 4803 return ptr == NULL; 4804} 4805 4806static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) 4807{ 4808 struct ublk_device *ub = *p_ub; 4809 int idx = ub->ub_number; 4810 int ret; 4811 4812 ret = mutex_lock_killable(&ublk_ctl_mutex); 4813 if (ret) 4814 return ret; 4815 4816 if (!test_bit(UB_STATE_DELETED, &ub->state)) { 4817 ublk_remove(ub); 4818 set_bit(UB_STATE_DELETED, &ub->state); 4819 } 4820 4821 /* Mark the reference as consumed */ 4822 *p_ub = NULL; 4823 ublk_put_device(ub); 4824 mutex_unlock(&ublk_ctl_mutex); 4825 4826 /* 4827 * Wait until the idr is removed, then it can be reused after 4828 * DEL_DEV command is returned. 
4829 * 4830 * If we returns because of user interrupt, future delete command 4831 * may come: 4832 * 4833 * - the device number isn't freed, this device won't or needn't 4834 * be deleted again, since UB_STATE_DELETED is set, and device 4835 * will be released after the last reference is dropped 4836 * 4837 * - the device number is freed already, we will not find this 4838 * device via ublk_get_device_from_id() 4839 */ 4840 if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) 4841 return -EINTR; 4842 return 0; 4843} 4844 4845static inline void ublk_ctrl_cmd_dump(u32 cmd_op, 4846 const struct ublksrv_ctrl_cmd *header) 4847{ 4848 pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", 4849 __func__, cmd_op, header->dev_id, header->queue_id, 4850 header->data[0], header->addr, header->len); 4851} 4852 4853static void ublk_ctrl_stop_dev(struct ublk_device *ub) 4854{ 4855 ublk_stop_dev(ub); 4856} 4857 4858static int ublk_ctrl_try_stop_dev(struct ublk_device *ub) 4859{ 4860 struct gendisk *disk; 4861 int ret = 0; 4862 4863 disk = ublk_get_disk(ub); 4864 if (!disk) 4865 return -ENODEV; 4866 4867 mutex_lock(&disk->open_mutex); 4868 if (disk_openers(disk) > 0) { 4869 ret = -EBUSY; 4870 goto unlock; 4871 } 4872 ub->block_open = true; 4873 /* release open_mutex as del_gendisk() will reacquire it */ 4874 mutex_unlock(&disk->open_mutex); 4875 4876 ublk_ctrl_stop_dev(ub); 4877 goto out; 4878 4879unlock: 4880 mutex_unlock(&disk->open_mutex); 4881out: 4882 ublk_put_disk(disk); 4883 return ret; 4884} 4885 4886static int ublk_ctrl_get_dev_info(struct ublk_device *ub, 4887 const struct ublksrv_ctrl_cmd *header) 4888{ 4889 struct task_struct *p; 4890 struct pid *pid; 4891 struct ublksrv_ctrl_dev_info dev_info; 4892 pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid; 4893 void __user *argp = (void __user *)(unsigned long)header->addr; 4894 4895 if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) 4896 return -EINVAL; 4897 4898 memcpy(&dev_info, &ub->dev_info, sizeof(dev_info)); 4899 dev_info.ublksrv_pid = -1; 4900 4901 if (init_ublksrv_tgid > 0) { 4902 rcu_read_lock(); 4903 pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns); 4904 p = pid_task(pid, PIDTYPE_TGID); 4905 if (p) { 4906 int vnr = task_tgid_vnr(p); 4907 4908 if (vnr) 4909 dev_info.ublksrv_pid = vnr; 4910 } 4911 rcu_read_unlock(); 4912 } 4913 4914 if (copy_to_user(argp, &dev_info, sizeof(dev_info))) 4915 return -EFAULT; 4916 4917 return 0; 4918} 4919 4920/* TYPE_DEVT is readonly, so fill it up before returning to userspace */ 4921static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) 4922{ 4923 ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); 4924 ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); 4925 4926 if (ub->ub_disk) { 4927 ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); 4928 ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); 4929 } else { 4930 ub->params.devt.disk_major = 0; 4931 ub->params.devt.disk_minor = 0; 4932 } 4933 ub->params.types |= UBLK_PARAM_TYPE_DEVT; 4934} 4935 4936static int ublk_ctrl_get_params(struct ublk_device *ub, 4937 const struct ublksrv_ctrl_cmd *header) 4938{ 4939 void __user *argp = (void __user *)(unsigned long)header->addr; 4940 struct ublk_params_header ph; 4941 int ret; 4942 4943 if (header->len <= sizeof(ph) || !header->addr) 4944 return -EINVAL; 4945 4946 if (copy_from_user(&ph, argp, sizeof(ph))) 4947 return -EFAULT; 4948 4949 if (ph.len > header->len || !ph.len) 4950 return -EINVAL; 4951 4952 if (ph.len > sizeof(struct 
ublk_params)) 4953 ph.len = sizeof(struct ublk_params); 4954 4955 mutex_lock(&ub->mutex); 4956 ublk_ctrl_fill_params_devt(ub); 4957 if (copy_to_user(argp, &ub->params, ph.len)) 4958 ret = -EFAULT; 4959 else 4960 ret = 0; 4961 mutex_unlock(&ub->mutex); 4962 4963 return ret; 4964} 4965 4966static int ublk_ctrl_set_params(struct ublk_device *ub, 4967 const struct ublksrv_ctrl_cmd *header) 4968{ 4969 void __user *argp = (void __user *)(unsigned long)header->addr; 4970 struct ublk_params_header ph; 4971 int ret = -EFAULT; 4972 4973 if (header->len <= sizeof(ph) || !header->addr) 4974 return -EINVAL; 4975 4976 if (copy_from_user(&ph, argp, sizeof(ph))) 4977 return -EFAULT; 4978 4979 if (ph.len > header->len || !ph.len || !ph.types) 4980 return -EINVAL; 4981 4982 if (ph.len > sizeof(struct ublk_params)) 4983 ph.len = sizeof(struct ublk_params); 4984 4985 mutex_lock(&ub->mutex); 4986 if (test_bit(UB_STATE_USED, &ub->state)) { 4987 /* 4988 * Parameters can only be changed when device hasn't 4989 * been started yet 4990 */ 4991 ret = -EACCES; 4992 } else if (copy_from_user(&ub->params, argp, ph.len)) { 4993 ret = -EFAULT; 4994 } else { 4995 /* clear all we don't support yet */ 4996 ub->params.types &= UBLK_PARAM_TYPE_ALL; 4997 ret = ublk_validate_params(ub); 4998 if (ret) 4999 ub->params.types = 0; 5000 } 5001 mutex_unlock(&ub->mutex); 5002 5003 return ret; 5004} 5005 5006static int ublk_ctrl_start_recovery(struct ublk_device *ub) 5007{ 5008 int ret = -EINVAL; 5009 5010 mutex_lock(&ub->mutex); 5011 if (ublk_nosrv_should_stop_dev(ub)) 5012 goto out_unlock; 5013 /* 5014 * START_RECOVERY is only allowed after: 5015 * 5016 * (1) UB_STATE_OPEN is not set, which means the dying process has exited 5017 * and related io_uring ctx is freed so file struct of /dev/ublkcX is 5018 * released.
5019 * 5020 * and one of the following holds 5021 * 5022 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work: 5023 * (a) has quiesced the request queue 5024 * (b) has requeued every inflight rq whose io_flags is ACTIVE 5025 * (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE 5026 * (d) has completed/canceled all ioucmds owned by the dying process 5027 * 5028 * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not 5029 * quiesced, but all I/O is being immediately errored 5030 */ 5031 if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) { 5032 ret = -EBUSY; 5033 goto out_unlock; 5034 } 5035 pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number); 5036 init_completion(&ub->completion); 5037 ret = 0; 5038 out_unlock: 5039 mutex_unlock(&ub->mutex); 5040 return ret; 5041} 5042 5043static int ublk_ctrl_end_recovery(struct ublk_device *ub, 5044 const struct ublksrv_ctrl_cmd *header) 5045{ 5046 int ublksrv_pid = (int)header->data[0]; 5047 int ret = -EINVAL; 5048 5049 pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__, 5050 header->dev_id); 5051 5052 if (wait_for_completion_interruptible(&ub->completion)) 5053 return -EINTR; 5054 5055 pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__, 5056 header->dev_id); 5057 5058 if (!ublk_validate_user_pid(ub, ublksrv_pid)) 5059 return -EINVAL; 5060 5061 mutex_lock(&ub->mutex); 5062 if (ublk_nosrv_should_stop_dev(ub)) 5063 goto out_unlock; 5064 5065 if (!ublk_dev_in_recoverable_state(ub)) { 5066 ret = -EBUSY; 5067 goto out_unlock; 5068 } 5069 ub->dev_info.ublksrv_pid = ub->ublksrv_tgid; 5070 ub->dev_info.state = UBLK_S_DEV_LIVE; 5071 pr_devel("%s: new ublksrv_pid %d, dev id %d\n", 5072 __func__, ublksrv_pid, header->dev_id); 5073 blk_mq_kick_requeue_list(ub->ub_disk->queue); 5074 ret = 0; 5075 out_unlock: 5076 mutex_unlock(&ub->mutex); 5077 return ret; 5078} 5079 5080static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header) 5081{ 5082 void __user *argp = (void __user *)(unsigned long)header->addr; 5083 u64 features = UBLK_F_ALL; 5084 5085 if (header->len != UBLK_FEATURES_LEN || !header->addr) 5086 return -EINVAL; 5087 5088 if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) 5089 return -EFAULT; 5090 5091 return 0; 5092} 5093 5094static int ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) 5095{ 5096 struct ublk_param_basic *p = &ub->params.basic; 5097 u64 new_size = header->data[0]; 5098 int ret = 0; 5099 5100 mutex_lock(&ub->mutex); 5101 if (!ub->ub_disk) { 5102 ret = -ENODEV; 5103 goto out; 5104 } 5105 p->dev_sectors = new_size; 5106 set_capacity_and_notify(ub->ub_disk, p->dev_sectors); 5107out: 5108 mutex_unlock(&ub->mutex); 5109 return ret; 5110} 5111 5112struct count_busy { 5113 const struct ublk_queue *ubq; 5114 unsigned int nr_busy; 5115}; 5116 5117static bool ublk_count_busy_req(struct request *rq, void *data) 5118{ 5119 struct count_busy *idle = data; 5120 5121 if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq) 5122 idle->nr_busy += 1; 5123 return true; 5124} 5125 5126/* uring_cmd is guaranteed to be active if the associated request is idle */ 5127static bool ubq_has_idle_io(const struct ublk_queue *ubq) 5128{ 5129 struct count_busy data = { 5130 .ubq = ubq, 5131 }; 5132 5133 blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data); 5134 return data.nr_busy < ubq->q_depth; 5135} 5136 5137/* Wait until each hw queue has at least one idle IO */ 5138static int
ublk_wait_for_idle_io(struct ublk_device *ub, 5139 unsigned int timeout_ms) 5140{ 5141 unsigned int elapsed = 0; 5142 int ret; 5143 5144 /* 5145 * For UBLK_F_BATCH_IO ublk server can get notified with existing 5146 * or new fetch command, so needn't wait any more 5147 */ 5148 if (ublk_dev_support_batch_io(ub)) 5149 return 0; 5150 5151 while (elapsed < timeout_ms && !signal_pending(current)) { 5152 unsigned int queues_cancelable = 0; 5153 int i; 5154 5155 for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { 5156 struct ublk_queue *ubq = ublk_get_queue(ub, i); 5157 5158 queues_cancelable += !!ubq_has_idle_io(ubq); 5159 } 5160 5161 /* 5162 * Each queue needs at least one active command for 5163 * notifying ublk server 5164 */ 5165 if (queues_cancelable == ub->dev_info.nr_hw_queues) 5166 break; 5167 5168 msleep(UBLK_REQUEUE_DELAY_MS); 5169 elapsed += UBLK_REQUEUE_DELAY_MS; 5170 } 5171 5172 if (signal_pending(current)) 5173 ret = -EINTR; 5174 else if (elapsed >= timeout_ms) 5175 ret = -EBUSY; 5176 else 5177 ret = 0; 5178 5179 return ret; 5180} 5181 5182static int ublk_ctrl_quiesce_dev(struct ublk_device *ub, 5183 const struct ublksrv_ctrl_cmd *header) 5184{ 5185 /* zero means wait forever */ 5186 u64 timeout_ms = header->data[0]; 5187 struct gendisk *disk; 5188 int ret = -ENODEV; 5189 5190 if (!(ub->dev_info.flags & UBLK_F_QUIESCE)) 5191 return -EOPNOTSUPP; 5192 5193 mutex_lock(&ub->mutex); 5194 disk = ublk_get_disk(ub); 5195 if (!disk) 5196 goto unlock; 5197 if (ub->dev_info.state == UBLK_S_DEV_DEAD) 5198 goto put_disk; 5199 5200 ret = 0; 5201 /* already in expected state */ 5202 if (ub->dev_info.state != UBLK_S_DEV_LIVE) 5203 goto put_disk; 5204 5205 /* Mark the device as canceling */ 5206 mutex_lock(&ub->cancel_mutex); 5207 blk_mq_quiesce_queue(disk->queue); 5208 ublk_set_canceling(ub, true); 5209 blk_mq_unquiesce_queue(disk->queue); 5210 mutex_unlock(&ub->cancel_mutex); 5211 5212 if (!timeout_ms) 5213 timeout_ms = UINT_MAX; 5214 ret = ublk_wait_for_idle_io(ub, timeout_ms); 5215 5216put_disk: 5217 ublk_put_disk(disk); 5218unlock: 5219 mutex_unlock(&ub->mutex); 5220 5221 /* Cancel pending uring_cmd */ 5222 if (!ret) 5223 ublk_cancel_dev(ub); 5224 return ret; 5225} 5226 5227/* 5228 * All control commands are sent via /dev/ublk-control, so we have to check 5229 * the destination device's permission 5230 */ 5231static int ublk_char_dev_permission(struct ublk_device *ub, 5232 const char *dev_path, int mask) 5233{ 5234 int err; 5235 struct path path; 5236 struct kstat stat; 5237 5238 err = kern_path(dev_path, LOOKUP_FOLLOW, &path); 5239 if (err) 5240 return err; 5241 5242 err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); 5243 if (err) 5244 goto exit; 5245 5246 err = -EPERM; 5247 if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) 5248 goto exit; 5249 5250 err = inode_permission(&nop_mnt_idmap, 5251 d_backing_inode(path.dentry), mask); 5252exit: 5253 path_put(&path); 5254 return err; 5255} 5256 5257/* 5258 * Lock for maple tree modification: acquire ub->mutex, then freeze queue 5259 * if device is started. If device is not yet started, only mutex is 5260 * needed since no I/O path can access the tree. 5261 * 5262 * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked() 5263 * already holds ub->mutex when calling del_gendisk() which freezes the queue. 
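 *
 * Typical usage, as in the REG_BUF/UNREG_BUF handlers below:
 *
 *	memflags = ublk_lock_buf_tree(ub);
 *	...modify ub->buf_tree and ub->buf_ida...
 *	ublk_unlock_buf_tree(ub, memflags);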
5264*/ 5265static unsigned int ublk_lock_buf_tree(struct ublk_device *ub) 5266{ 5267 unsigned int memflags = 0; 5268 5269 mutex_lock(&ub->mutex); 5270 if (ub->ub_disk) 5271 memflags = blk_mq_freeze_queue(ub->ub_disk->queue); 5272 5273 return memflags; 5274} 5275 5276static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags) 5277{ 5278 if (ub->ub_disk) 5279 blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags); 5280 mutex_unlock(&ub->mutex); 5281} 5282 5283/* Erase coalesced PFN ranges from the maple tree matching buf_index */ 5284static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index) 5285{ 5286 MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); 5287 struct ublk_buf_range *range; 5288 5289 mas_lock(&mas); 5290 mas_for_each(&mas, range, ULONG_MAX) { 5291 if (range->buf_index == buf_index) { 5292 mas_erase(&mas); 5293 kfree(range); 5294 } 5295 } 5296 mas_unlock(&mas); 5297} 5298 5299static int __ublk_ctrl_reg_buf(struct ublk_device *ub, 5300 struct page **pages, unsigned long nr_pages, 5301 int index, unsigned short flags) 5302{ 5303 unsigned long i; 5304 int ret; 5305 5306 for (i = 0; i < nr_pages; i++) { 5307 unsigned long pfn = page_to_pfn(pages[i]); 5308 unsigned long start = i; 5309 struct ublk_buf_range *range; 5310 5311 /* Find run of consecutive PFNs */ 5312 while (i + 1 < nr_pages && 5313 page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1) 5314 i++; 5315 5316 range = kzalloc(sizeof(*range), GFP_KERNEL); 5317 if (!range) { 5318 ret = -ENOMEM; 5319 goto unwind; 5320 } 5321 range->buf_index = index; 5322 range->flags = flags; 5323 range->base_offset = start << PAGE_SHIFT; 5324 5325 ret = mtree_insert_range(&ub->buf_tree, pfn, 5326 pfn + (i - start), 5327 range, GFP_KERNEL); 5328 if (ret) { 5329 kfree(range); 5330 goto unwind; 5331 } 5332 } 5333 return 0; 5334 5335unwind: 5336 ublk_buf_erase_ranges(ub, index); 5337 return ret; 5338} 5339 5340/* 5341 * Register a shared memory buffer for zero-copy I/O. 5342 * Pins pages, builds PFN maple tree, freezes/unfreezes the queue 5343 * internally. Returns buffer index (>= 0) on success. 
5344 */ 5345static int ublk_ctrl_reg_buf(struct ublk_device *ub, 5346 struct ublksrv_ctrl_cmd *header) 5347{ 5348 void __user *argp = (void __user *)(unsigned long)header->addr; 5349 struct ublk_shmem_buf_reg buf_reg; 5350 unsigned long nr_pages; 5351 struct page **pages = NULL; 5352 unsigned int gup_flags; 5353 unsigned int memflags; 5354 long pinned; 5355 int index; 5356 int ret; 5357 5358 if (!ublk_dev_support_shmem_zc(ub)) 5359 return -EOPNOTSUPP; 5360 5361 memset(&buf_reg, 0, sizeof(buf_reg)); 5362 if (copy_from_user(&buf_reg, argp, 5363 min_t(size_t, header->len, sizeof(buf_reg)))) 5364 return -EFAULT; 5365 5366 if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY) 5367 return -EINVAL; 5368 5369 if (buf_reg.reserved) 5370 return -EINVAL; 5371 5372 if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX || 5373 !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr)) 5374 return -EINVAL; 5375 5376 nr_pages = buf_reg.len >> PAGE_SHIFT; 5377 5378 /* Pin pages before any locks (may sleep) */ 5379 pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); 5380 if (!pages) 5381 return -ENOMEM; 5382 5383 gup_flags = FOLL_LONGTERM; 5384 if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY)) 5385 gup_flags |= FOLL_WRITE; 5386 5387 pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages); 5388 if (pinned < 0) { 5389 ret = pinned; 5390 goto err_free_pages; 5391 } 5392 if (pinned != nr_pages) { 5393 ret = -EFAULT; 5394 goto err_unpin; 5395 } 5396 5397 memflags = ublk_lock_buf_tree(ub); 5398 5399 index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL); 5400 if (index < 0) { 5401 ret = index; 5402 goto err_unlock; 5403 } 5404 5405 ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags); 5406 if (ret) { 5407 ida_free(&ub->buf_ida, index); 5408 goto err_unlock; 5409 } 5410 5411 ublk_unlock_buf_tree(ub, memflags); 5412 kvfree(pages); 5413 return index; 5414 5415err_unlock: 5416 ublk_unlock_buf_tree(ub, memflags); 5417err_unpin: 5418 unpin_user_pages(pages, pinned); 5419err_free_pages: 5420 kvfree(pages); 5421 return ret; 5422} 5423 5424static void ublk_unpin_range_pages(unsigned long base_pfn, 5425 unsigned long nr_pages) 5426{ 5427#define UBLK_UNPIN_BATCH 32 5428 struct page *pages[UBLK_UNPIN_BATCH]; 5429 unsigned long off; 5430 5431 for (off = 0; off < nr_pages; ) { 5432 unsigned int batch = min_t(unsigned long, 5433 nr_pages - off, UBLK_UNPIN_BATCH); 5434 unsigned int j; 5435 5436 for (j = 0; j < batch; j++) 5437 pages[j] = pfn_to_page(base_pfn + off + j); 5438 unpin_user_pages(pages, batch); 5439 off += batch; 5440 } 5441} 5442 5443/* 5444 * Inner loop: erase up to UBLK_REMOVE_BATCH matching ranges under 5445 * mas_lock, collecting them into an xarray. Then drop the lock and 5446 * unpin pages + free ranges outside spinlock context. 5447 * 5448 * Returns true if the tree walk completed, false if more ranges remain. 5449 * Xarray key is the base PFN, value encodes nr_pages via xa_mk_value(). 
5450 */ 5451#define UBLK_REMOVE_BATCH 64 5452 5453static bool __ublk_shmem_remove_ranges(struct ublk_device *ub, 5454 int buf_index, int *ret) 5455{ 5456 MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); 5457 struct ublk_buf_range *range; 5458 struct xarray to_unpin; 5459 unsigned long idx; 5460 unsigned int count = 0; 5461 bool done = false; 5462 void *entry; 5463 5464 xa_init(&to_unpin); 5465 5466 mas_lock(&mas); 5467 mas_for_each(&mas, range, ULONG_MAX) { 5468 unsigned long nr; 5469 5470 if (buf_index >= 0 && range->buf_index != buf_index) 5471 continue; 5472 5473 *ret = 0; 5474 nr = mas.last - mas.index + 1; 5475 if (xa_err(xa_store(&to_unpin, mas.index, 5476 xa_mk_value(nr), GFP_ATOMIC))) 5477 goto unlock; 5478 mas_erase(&mas); 5479 kfree(range); 5480 if (++count >= UBLK_REMOVE_BATCH) 5481 goto unlock; 5482 } 5483 done = true; 5484unlock: 5485 mas_unlock(&mas); 5486 5487 xa_for_each(&to_unpin, idx, entry) 5488 ublk_unpin_range_pages(idx, xa_to_value(entry)); 5489 xa_destroy(&to_unpin); 5490 5491 return done; 5492} 5493 5494/* 5495 * Remove ranges from the maple tree matching buf_index, unpin pages 5496 * and free range structs. If buf_index < 0, remove all ranges. 5497 * Processes ranges in batches to avoid holding the maple tree spinlock 5498 * across potentially expensive page unpinning. 5499 */ 5500static int ublk_shmem_remove_ranges(struct ublk_device *ub, int buf_index) 5501{ 5502 int ret = -ENOENT; 5503 5504 while (!__ublk_shmem_remove_ranges(ub, buf_index, &ret)) 5505 cond_resched(); 5506 return ret; 5507} 5508 5509static int ublk_ctrl_unreg_buf(struct ublk_device *ub, 5510 struct ublksrv_ctrl_cmd *header) 5511{ 5512 int index = (int)header->data[0]; 5513 unsigned int memflags; 5514 int ret; 5515 5516 if (!ublk_dev_support_shmem_zc(ub)) 5517 return -EOPNOTSUPP; 5518 5519 if (index < 0 || index > USHRT_MAX) 5520 return -EINVAL; 5521 5522 memflags = ublk_lock_buf_tree(ub); 5523 5524 ret = ublk_shmem_remove_ranges(ub, index); 5525 if (!ret) 5526 ida_free(&ub->buf_ida, index); 5527 5528 ublk_unlock_buf_tree(ub, memflags); 5529 return ret; 5530} 5531 5532static void ublk_buf_cleanup(struct ublk_device *ub) 5533{ 5534 ublk_shmem_remove_ranges(ub, -1); 5535 mtree_destroy(&ub->buf_tree); 5536 ida_destroy(&ub->buf_ida); 5537} 5538 5539/* Check if request pages match a registered shared memory buffer */ 5540static bool ublk_try_buf_match(struct ublk_device *ub, 5541 struct request *rq, 5542 u32 *buf_idx, u32 *buf_off) 5543{ 5544 struct req_iterator iter; 5545 struct bio_vec bv; 5546 int index = -1; 5547 unsigned long expected_offset = 0; 5548 bool first = true; 5549 5550 rq_for_each_bvec(bv, rq, iter) { 5551 unsigned long pfn = page_to_pfn(bv.bv_page); 5552 unsigned long end_pfn = pfn + 5553 ((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT); 5554 struct ublk_buf_range *range; 5555 unsigned long off; 5556 MA_STATE(mas, &ub->buf_tree, pfn, pfn); 5557 5558 range = mas_walk(&mas); 5559 if (!range) 5560 return false; 5561 5562 /* verify all pages in this bvec fall within the range */ 5563 if (end_pfn > mas.last) 5564 return false; 5565 5566 off = range->base_offset + 5567 (pfn - mas.index) * PAGE_SIZE + bv.bv_offset; 5568 5569 if (first) { 5570 /* Read-only buffer can't serve READ (kernel writes) */ 5571 if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) && 5572 req_op(rq) != REQ_OP_WRITE) 5573 return false; 5574 index = range->buf_index; 5575 expected_offset = off; 5576 *buf_off = off; 5577 first = false; 5578 } else { 5579 if (range->buf_index != index) 5580 return false; 5581 if (off != expected_offset) 
5582 return false; 5583 } 5584 expected_offset += bv.bv_len; 5585 } 5586 5587 if (first) 5588 return false; 5589 5590 *buf_idx = index; 5591 return true; 5592} 5593 5594static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, 5595 u32 cmd_op, struct ublksrv_ctrl_cmd *header) 5596{ 5597 bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; 5598 void __user *argp = (void __user *)(unsigned long)header->addr; 5599 char *dev_path = NULL; 5600 int ret = 0; 5601 int mask; 5602 5603 if (!unprivileged) { 5604 if (!capable(CAP_SYS_ADMIN)) 5605 return -EPERM; 5606 /* 5607 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes 5608 * char_dev_path in payload too, since userspace may not 5609 * know if the specified device is created as unprivileged 5610 * mode. 5611 */ 5612 if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2) 5613 return 0; 5614 } 5615 5616 /* 5617 * User has to provide the char device path for unprivileged ublk 5618 * 5619 * header->addr always points to the dev path buffer, and 5620 * header->dev_path_len records length of dev path buffer. 5621 */ 5622 if (!header->dev_path_len || header->dev_path_len > PATH_MAX) 5623 return -EINVAL; 5624 5625 if (header->len < header->dev_path_len) 5626 return -EINVAL; 5627 5628 dev_path = memdup_user_nul(argp, header->dev_path_len); 5629 if (IS_ERR(dev_path)) 5630 return PTR_ERR(dev_path); 5631 5632 ret = -EINVAL; 5633 switch (_IOC_NR(cmd_op)) { 5634 case UBLK_CMD_GET_DEV_INFO: 5635 case UBLK_CMD_GET_DEV_INFO2: 5636 case UBLK_CMD_GET_QUEUE_AFFINITY: 5637 case UBLK_CMD_GET_PARAMS: 5638 case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): 5639 mask = MAY_READ; 5640 break; 5641 case UBLK_CMD_START_DEV: 5642 case UBLK_CMD_STOP_DEV: 5643 case UBLK_CMD_ADD_DEV: 5644 case UBLK_CMD_DEL_DEV: 5645 case UBLK_CMD_SET_PARAMS: 5646 case UBLK_CMD_START_USER_RECOVERY: 5647 case UBLK_CMD_END_USER_RECOVERY: 5648 case UBLK_CMD_UPDATE_SIZE: 5649 case UBLK_CMD_QUIESCE_DEV: 5650 case UBLK_CMD_TRY_STOP_DEV: 5651 case UBLK_CMD_REG_BUF: 5652 case UBLK_CMD_UNREG_BUF: 5653 mask = MAY_READ | MAY_WRITE; 5654 break; 5655 default: 5656 goto exit; 5657 } 5658 5659 ret = ublk_char_dev_permission(ub, dev_path, mask); 5660 if (!ret) { 5661 header->len -= header->dev_path_len; 5662 header->addr += header->dev_path_len; 5663 } 5664 pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", 5665 __func__, ub->ub_number, cmd_op, 5666 ub->dev_info.owner_uid, ub->dev_info.owner_gid, 5667 dev_path, ret); 5668exit: 5669 kfree(dev_path); 5670 return ret; 5671} 5672 5673static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op) 5674{ 5675 switch (_IOC_NR(cmd_op)) { 5676 case UBLK_CMD_GET_QUEUE_AFFINITY: 5677 case UBLK_CMD_GET_DEV_INFO: 5678 case UBLK_CMD_GET_DEV_INFO2: 5679 case _IOC_NR(UBLK_U_CMD_GET_FEATURES): 5680 return false; 5681 default: 5682 return true; 5683 } 5684} 5685 5686static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, 5687 unsigned int issue_flags) 5688{ 5689 /* May point to userspace-mapped memory */ 5690 const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe128_cmd(cmd->sqe, 5691 struct ublksrv_ctrl_cmd); 5692 struct ublksrv_ctrl_cmd header; 5693 struct ublk_device *ub = NULL; 5694 u32 cmd_op = cmd->cmd_op; 5695 int ret = -EINVAL; 5696 5697 if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) && 5698 issue_flags & IO_URING_F_NONBLOCK) 5699 return -EAGAIN; 5700 5701 if (!(issue_flags & IO_URING_F_SQE128)) 5702 return -EINVAL; 5703 5704 header.dev_id = READ_ONCE(ub_src->dev_id); 5705 header.queue_id = READ_ONCE(ub_src->queue_id); 5706 header.len = READ_ONCE(ub_src->len); 
5707 header.addr = READ_ONCE(ub_src->addr); 5708 header.data[0] = READ_ONCE(ub_src->data[0]); 5709 header.dev_path_len = READ_ONCE(ub_src->dev_path_len); 5710 ublk_ctrl_cmd_dump(cmd_op, &header); 5711 5712 ret = ublk_check_cmd_op(cmd_op); 5713 if (ret) 5714 goto out; 5715 5716 if (cmd_op == UBLK_U_CMD_GET_FEATURES) { 5717 ret = ublk_ctrl_get_features(&header); 5718 goto out; 5719 } 5720 5721 if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { 5722 ret = -ENODEV; 5723 ub = ublk_get_device_from_id(header.dev_id); 5724 if (!ub) 5725 goto out; 5726 5727 ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header); 5728 if (ret) 5729 goto put_dev; 5730 } 5731 5732 switch (_IOC_NR(cmd_op)) { 5733 case UBLK_CMD_START_DEV: 5734 ret = ublk_ctrl_start_dev(ub, &header); 5735 break; 5736 case UBLK_CMD_STOP_DEV: 5737 ublk_ctrl_stop_dev(ub); 5738 ret = 0; 5739 break; 5740 case UBLK_CMD_GET_DEV_INFO: 5741 case UBLK_CMD_GET_DEV_INFO2: 5742 ret = ublk_ctrl_get_dev_info(ub, &header); 5743 break; 5744 case UBLK_CMD_ADD_DEV: 5745 ret = ublk_ctrl_add_dev(&header); 5746 break; 5747 case UBLK_CMD_DEL_DEV: 5748 ret = ublk_ctrl_del_dev(&ub, true); 5749 break; 5750 case UBLK_CMD_DEL_DEV_ASYNC: 5751 ret = ublk_ctrl_del_dev(&ub, false); 5752 break; 5753 case UBLK_CMD_GET_QUEUE_AFFINITY: 5754 ret = ublk_ctrl_get_queue_affinity(ub, &header); 5755 break; 5756 case UBLK_CMD_GET_PARAMS: 5757 ret = ublk_ctrl_get_params(ub, &header); 5758 break; 5759 case UBLK_CMD_SET_PARAMS: 5760 ret = ublk_ctrl_set_params(ub, &header); 5761 break; 5762 case UBLK_CMD_START_USER_RECOVERY: 5763 ret = ublk_ctrl_start_recovery(ub); 5764 break; 5765 case UBLK_CMD_END_USER_RECOVERY: 5766 ret = ublk_ctrl_end_recovery(ub, &header); 5767 break; 5768 case UBLK_CMD_UPDATE_SIZE: 5769 ret = ublk_ctrl_set_size(ub, &header); 5770 break; 5771 case UBLK_CMD_QUIESCE_DEV: 5772 ret = ublk_ctrl_quiesce_dev(ub, &header); 5773 break; 5774 case UBLK_CMD_TRY_STOP_DEV: 5775 ret = ublk_ctrl_try_stop_dev(ub); 5776 break; 5777 case UBLK_CMD_REG_BUF: 5778 ret = ublk_ctrl_reg_buf(ub, &header); 5779 break; 5780 case UBLK_CMD_UNREG_BUF: 5781 ret = ublk_ctrl_unreg_buf(ub, &header); 5782 break; 5783 default: 5784 ret = -EOPNOTSUPP; 5785 break; 5786 } 5787 5788 put_dev: 5789 if (ub) 5790 ublk_put_device(ub); 5791 out: 5792 pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", 5793 __func__, ret, cmd_op, header.dev_id, header.queue_id); 5794 return ret; 5795} 5796 5797static const struct file_operations ublk_ctl_fops = { 5798 .open = nonseekable_open, 5799 .uring_cmd = ublk_ctrl_uring_cmd, 5800 .owner = THIS_MODULE, 5801 .llseek = noop_llseek, 5802}; 5803 5804static struct miscdevice ublk_misc = { 5805 .minor = MISC_DYNAMIC_MINOR, 5806 .name = "ublk-control", 5807 .fops = &ublk_ctl_fops, 5808}; 5809 5810static int __init ublk_init(void) 5811{ 5812 int ret; 5813 5814 BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + 5815 UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); 5816 /* 5817 * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE 5818 * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG 5819 */ 5820 BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >= 5821 UBLKSRV_IO_INTEGRITY_FLAG); 5822 BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); 5823 5824 init_waitqueue_head(&ublk_idr_wq); 5825 5826 ret = misc_register(&ublk_misc); 5827 if (ret) 5828 return ret; 5829 5830 ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char"); 5831 if (ret) 5832 goto unregister_mis; 5833 5834 ret = class_register(&ublk_chr_class); 5835 if (ret) 5836 goto 
free_chrdev_region; 5837 5838 return 0; 5839 5840free_chrdev_region: 5841 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 5842unregister_mis: 5843 misc_deregister(&ublk_misc); 5844 return ret; 5845} 5846 5847static void __exit ublk_exit(void) 5848{ 5849 struct ublk_device *ub; 5850 int id; 5851 5852 idr_for_each_entry(&ublk_index_idr, ub, id) 5853 ublk_remove(ub); 5854 5855 class_unregister(&ublk_chr_class); 5856 misc_deregister(&ublk_misc); 5857 5858 idr_destroy(&ublk_index_idr); 5859 unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); 5860} 5861 5862module_init(ublk_init); 5863module_exit(ublk_exit); 5864 5865static int ublk_set_max_unprivileged_ublks(const char *buf, 5866 const struct kernel_param *kp) 5867{ 5868 return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); 5869} 5870 5871static int ublk_get_max_unprivileged_ublks(char *buf, 5872 const struct kernel_param *kp) 5873{ 5874 return sysfs_emit(buf, "%u\n", unprivileged_ublks_max); 5875} 5876 5877static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = { 5878 .set = ublk_set_max_unprivileged_ublks, 5879 .get = ublk_get_max_unprivileged_ublks, 5880}; 5881 5882module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops, 5883 &unprivileged_ublks_max, 0644); 5884MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to be added (default: 64)"); 5885 5886MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); 5887MODULE_DESCRIPTION("Userspace block device"); 5888MODULE_LICENSE("GPL");
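
/*
 * Illustrative userspace sketch (not part of the driver): querying the
 * driver feature mask with UBLK_U_CMD_GET_FEATURES on /dev/ublk-control.
 * Assumes liburing plus the io_uring and ublk uapi headers; includes and
 * error handling are omitted.
 *
 *	struct io_uring_params p = { .flags = IORING_SETUP_SQE128, };
 *	struct io_uring ring;
 *	struct io_uring_sqe *sqe;
 *	struct io_uring_cqe *cqe;
 *	struct ublksrv_ctrl_cmd *ctrl;
 *	__u64 features = 0;
 *	int fd = open("/dev/ublk-control", O_RDWR);
 *
 *	io_uring_queue_init_params(4, &ring, &p);
 *	sqe = io_uring_get_sqe(&ring);
 *	sqe->opcode = IORING_OP_URING_CMD;
 *	sqe->fd = fd;
 *	sqe->cmd_op = UBLK_U_CMD_GET_FEATURES;
 *	ctrl = (struct ublksrv_ctrl_cmd *)sqe->cmd;
 *	ctrl->addr = (__u64)(uintptr_t)&features;
 *	ctrl->len = UBLK_FEATURES_LEN;
 *	io_uring_submit(&ring);
 *	io_uring_wait_cqe(&ring, &cqe);		// cqe->res == 0 on success
 */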