// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe ZNS-ZBD command implementation.
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/nvme.h>
#include <linux/blkdev.h>
#include "nvmet.h"

/*
 * We set the Memory Page Size Minimum (MPSMIN) for the target controller to
 * 0, to which nvme_enable_ctrl() adds 12, resulting in 2^12 = 4k as the
 * page_shift value. When calculating the ZASL use shift by 12.
 */
#define NVMET_MPSMIN_SHIFT	12

static inline u8 nvmet_zasl(unsigned int zone_append_sects)
{
	/*
	 * Zone Append Size Limit (zasl) is expressed as a power of 2 value
	 * with the minimum memory page size (i.e. 12) as unit.
	 */
	return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - 9));
}
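
/*
 * Worked example (illustrative numbers, added for clarity): a backing device
 * whose max_zone_append_sectors limit is 1024 512-byte sectors (512 KiB)
 * yields zasl = ilog2(1024 >> 3) = 7, i.e. the controller advertises a Zone
 * Append Size Limit of 2^7 * 4 KiB = 512 KiB in minimum-memory-page units.
 */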

static int validate_conv_zones_cb(struct blk_zone *z,
				  unsigned int i, void *data)
{
	if (z->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return -EOPNOTSUPP;
	return 0;
}

bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
{
	u8 zasl = nvmet_zasl(bdev_max_zone_append_sectors(ns->bdev));
	struct gendisk *bd_disk = ns->bdev->bd_disk;
	int ret;

	if (ns->subsys->zasl) {
		if (ns->subsys->zasl > zasl)
			return false;
	}
	ns->subsys->zasl = zasl;

	/*
	 * Generic zoned block devices may have a smaller last zone which is
	 * not supported by ZNS. Exclude zoned drives that have such smaller
	 * last zone.
	 */
	if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1))
		return false;
	/*
	 * ZNS does not define a conventional zone type. Use report zones
	 * to detect if the device has conventional zones and reject it if
	 * it does.
	 */
	ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev),
				  validate_conv_zones_cb, NULL);
	if (ret < 0)
		return false;

	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));

	return true;
}
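
/*
 * Illustration of the runt-zone check above (example numbers, not tied to any
 * particular device): with 256 MiB zones (524288 sectors, a power of two), a
 * device whose capacity is not a multiple of 524288 sectors has a smaller
 * last zone, so get_capacity() & (zone_sectors - 1) is non-zero and the
 * namespace is rejected for ZNS use.
 */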

void nvmet_execute_identify_ctrl_zns(struct nvmet_req *req)
{
	u8 zasl = req->sq->ctrl->subsys->zasl;
	struct nvme_id_ctrl_zns *id;
	u16 status;

	id = kzalloc(sizeof(*id), GFP_KERNEL);
	if (!id) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	id->zasl = min_not_zero(nvmet_ctrl_mdts(req), zasl);

	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));

	kfree(id);
out:
	nvmet_req_complete(req, status);
}

void nvmet_execute_identify_ns_zns(struct nvmet_req *req)
{
	struct nvme_id_ns_zns *id_zns = NULL;
	u64 zsze;
	u16 status;
	u32 mar, mor;

	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
		req->error_loc = offsetof(struct nvme_identify, nsid);
		status = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
		goto out;
	}

	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
	if (!id_zns) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	status = nvmet_req_find_ns(req);
	if (status)
		goto done;

	if (nvmet_ns_revalidate(req->ns)) {
		mutex_lock(&req->ns->subsys->lock);
		nvmet_ns_changed(req->ns->subsys, req->ns->nsid);
		mutex_unlock(&req->ns->subsys->lock);
	}

	if (!bdev_is_zoned(req->ns->bdev)) {
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		req->error_loc = offsetof(struct nvme_identify, nsid);
		goto out;
	}

	zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >>
					req->ns->blksize_shift;
	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);

	mor = bdev_max_open_zones(req->ns->bdev);
	if (!mor)
		mor = U32_MAX;
	else
		mor--;
	id_zns->mor = cpu_to_le32(mor);

	mar = bdev_max_active_zones(req->ns->bdev);
	if (!mar)
		mar = U32_MAX;
	else
		mar--;
	id_zns->mar = cpu_to_le32(mar);

done:
	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
out:
	kfree(id_zns);
	nvmet_req_complete(req, status);
}
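
/*
 * Notes on the Identify Zoned Namespace fields built above (illustrative
 * numbers, added for clarity): with 256 MiB zones (524288 512-byte sectors)
 * and a 4 KiB logical block size (blksize_shift = 12), zsze =
 * (524288 << 9) >> 12 = 65536 LBAs per zone. MOR and MAR are 0's based in
 * NVMe, while the block layer reports 0 for "no limit"; hence the limit is
 * decremented when present and set to U32_MAX (unlimited) otherwise.
 */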

static u16 nvmet_bdev_validate_zone_mgmt_recv(struct nvmet_req *req)
{
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;

	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, slba);
		return NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
	}

	if (out_bufsize < sizeof(struct nvme_zone_report)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, numd);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, zra);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	switch (req->cmd->zmr.pr) {
	case 0:
	case 1:
		break;
	default:
		req->error_loc = offsetof(struct nvme_zone_mgmt_recv_cmd, pr);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	switch (req->cmd->zmr.zrasf) {
	case NVME_ZRASF_ZONE_REPORT_ALL:
	case NVME_ZRASF_ZONE_STATE_EMPTY:
	case NVME_ZRASF_ZONE_STATE_IMP_OPEN:
	case NVME_ZRASF_ZONE_STATE_EXP_OPEN:
	case NVME_ZRASF_ZONE_STATE_CLOSED:
	case NVME_ZRASF_ZONE_STATE_FULL:
	case NVME_ZRASF_ZONE_STATE_READONLY:
	case NVME_ZRASF_ZONE_STATE_OFFLINE:
		break;
	default:
		req->error_loc =
			offsetof(struct nvme_zone_mgmt_recv_cmd, zrasf);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	return NVME_SC_SUCCESS;
}

struct nvmet_report_zone_data {
	struct nvmet_req *req;
	u64 out_buf_offset;
	u64 out_nr_zones;
	u64 nr_zones;
	u8 zrasf;
};

static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned i, void *d)
{
	static const unsigned int nvme_zrasf_to_blk_zcond[] = {
		[NVME_ZRASF_ZONE_STATE_EMPTY]	 = BLK_ZONE_COND_EMPTY,
		[NVME_ZRASF_ZONE_STATE_IMP_OPEN] = BLK_ZONE_COND_IMP_OPEN,
		[NVME_ZRASF_ZONE_STATE_EXP_OPEN] = BLK_ZONE_COND_EXP_OPEN,
		[NVME_ZRASF_ZONE_STATE_CLOSED]	 = BLK_ZONE_COND_CLOSED,
		[NVME_ZRASF_ZONE_STATE_READONLY] = BLK_ZONE_COND_READONLY,
		[NVME_ZRASF_ZONE_STATE_FULL]	 = BLK_ZONE_COND_FULL,
		[NVME_ZRASF_ZONE_STATE_OFFLINE]	 = BLK_ZONE_COND_OFFLINE,
	};
	struct nvmet_report_zone_data *rz = d;

	if (rz->zrasf != NVME_ZRASF_ZONE_REPORT_ALL &&
	    z->cond != nvme_zrasf_to_blk_zcond[rz->zrasf])
		return 0;

	if (rz->nr_zones < rz->out_nr_zones) {
		struct nvme_zone_descriptor zdesc = { };
		u16 status;

		zdesc.zcap = nvmet_sect_to_lba(rz->req->ns, z->capacity);
		zdesc.zslba = nvmet_sect_to_lba(rz->req->ns, z->start);
		zdesc.wp = nvmet_sect_to_lba(rz->req->ns, z->wp);
		zdesc.za = z->reset ? 1 << 2 : 0;
		zdesc.zs = z->cond << 4;
		zdesc.zt = z->type;

		status = nvmet_copy_to_sgl(rz->req, rz->out_buf_offset, &zdesc,
					   sizeof(zdesc));
		if (status)
			return -EINVAL;

		rz->out_buf_offset += sizeof(zdesc);
	}

	rz->nr_zones++;

	return 0;
}
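
/*
 * Note (added for clarity): the Zone State field of an NVMe zone descriptor
 * occupies the upper nibble of the ZS byte, and the blk_zone_cond values
 * (BLK_ZONE_COND_EMPTY = 0x1 ... BLK_ZONE_COND_OFFLINE = 0xf) match the ZNS
 * zone state encodings, which is why "z->cond << 4" above is sufficient to
 * fill in zdesc.zs.
 */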

static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req)
{
	unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);

	return bdev_nr_zones(req->ns->bdev) - bdev_zone_no(req->ns->bdev, sect);
}

static unsigned long get_nr_zones_from_buf(struct nvmet_req *req, u32 bufsize)
{
	if (bufsize <= sizeof(struct nvme_zone_report))
		return 0;

	return (bufsize - sizeof(struct nvme_zone_report)) /
		sizeof(struct nvme_zone_descriptor);
}
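
/*
 * Sizing example (illustrative, added for clarity): NUMD is a 0's based dword
 * count, so a host passing numd = 1023 describes a 4096-byte buffer. With a
 * 64-byte struct nvme_zone_report header and 64-byte zone descriptors, such a
 * buffer holds (4096 - 64) / 64 = 63 descriptors, which is the reporting
 * limit that get_nr_zones_from_buf() returns.
 */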

static void nvmet_bdev_zone_zmgmt_recv_work(struct work_struct *w)
{
	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
	sector_t start_sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
	unsigned long req_slba_nr_zones = nvmet_req_nr_zones_from_slba(req);
	u32 out_bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
	__le64 nr_zones;
	u16 status;
	int ret;
	struct nvmet_report_zone_data rz_data = {
		.out_nr_zones = get_nr_zones_from_buf(req, out_bufsize),
		/* leave space for the report zone header */
		.out_buf_offset = sizeof(struct nvme_zone_report),
		.zrasf = req->cmd->zmr.zrasf,
		.nr_zones = 0,
		.req = req,
	};

	status = nvmet_bdev_validate_zone_mgmt_recv(req);
	if (status)
		goto out;

	if (!req_slba_nr_zones) {
		status = NVME_SC_SUCCESS;
		goto out;
	}

	ret = blkdev_report_zones(req->ns->bdev, start_sect, req_slba_nr_zones,
				  nvmet_bdev_report_zone_cb, &rz_data);
	if (ret < 0) {
		status = NVME_SC_INTERNAL;
		goto out;
	}

	/*
	 * When the partial bit is set, nr_zones must indicate the number of
	 * zone descriptors actually transferred.
	 */
	if (req->cmd->zmr.pr)
		rz_data.nr_zones = min(rz_data.nr_zones, rz_data.out_nr_zones);

	nr_zones = cpu_to_le64(rz_data.nr_zones);
	status = nvmet_copy_to_sgl(req, 0, &nr_zones, sizeof(nr_zones));

out:
	nvmet_req_complete(req, status);
}

void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
{
	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zone_zmgmt_recv_work);
	queue_work(zbd_wq, &req->z.zmgmt_work);
}
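
/*
 * Note (added for clarity): Zone Management Receive is deferred to the zbd_wq
 * workqueue, presumably because blkdev_report_zones() is a blocking call that
 * can take a while on devices with many zones, so the report does not run
 * directly in the command submission path.
 */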

static inline enum req_op zsa_req_op(u8 zsa)
{
	switch (zsa) {
	case NVME_ZONE_OPEN:
		return REQ_OP_ZONE_OPEN;
	case NVME_ZONE_CLOSE:
		return REQ_OP_ZONE_CLOSE;
	case NVME_ZONE_FINISH:
		return REQ_OP_ZONE_FINISH;
	case NVME_ZONE_RESET:
		return REQ_OP_ZONE_RESET;
	default:
		return REQ_OP_LAST;
	}
}

static u16 blkdev_zone_mgmt_errno_to_nvme_status(int ret)
{
	switch (ret) {
	case 0:
		return NVME_SC_SUCCESS;
	case -EINVAL:
	case -EIO:
		return NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR;
	default:
		return NVME_SC_INTERNAL;
	}
}

struct nvmet_zone_mgmt_send_all_data {
	unsigned long *zbitmap;
	struct nvmet_req *req;
};

static int zmgmt_send_scan_cb(struct blk_zone *z, unsigned i, void *d)
{
	struct nvmet_zone_mgmt_send_all_data *data = d;

	switch (zsa_req_op(data->req->cmd->zms.zsa)) {
	case REQ_OP_ZONE_OPEN:
		switch (z->cond) {
		case BLK_ZONE_COND_CLOSED:
			break;
		default:
			return 0;
		}
		break;
	case REQ_OP_ZONE_CLOSE:
		switch (z->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
			break;
		default:
			return 0;
		}
		break;
	case REQ_OP_ZONE_FINISH:
		switch (z->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			break;
		default:
			return 0;
		}
		break;
	default:
		return -EINVAL;
	}

	set_bit(i, data->zbitmap);

	return 0;
}

static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
{
	struct block_device *bdev = req->ns->bdev;
	unsigned int nr_zones = bdev_nr_zones(bdev);
	struct bio *bio = NULL;
	sector_t sector = 0;
	int ret;
	struct nvmet_zone_mgmt_send_all_data d = {
		.req = req,
	};

	d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)),
				 GFP_NOIO, bdev->bd_disk->node_id);
	if (!d.zbitmap) {
		ret = -ENOMEM;
		goto out;
	}

	/* Scan and build bitmap of the eligible zones */
	ret = blkdev_report_zones(bdev, 0, nr_zones, zmgmt_send_scan_cb, &d);
	if (ret != nr_zones) {
		if (ret > 0)
			ret = -EIO;
		goto out;
	} else {
		/* We scanned all the zones */
		ret = 0;
	}

	while (sector < bdev_nr_sectors(bdev)) {
		if (test_bit(disk_zone_no(bdev->bd_disk, sector), d.zbitmap)) {
			bio = blk_next_bio(bio, bdev, 0,
					   zsa_req_op(req->cmd->zms.zsa) | REQ_SYNC,
					   GFP_KERNEL);
			bio->bi_iter.bi_sector = sector;
			/* This may take a while, so be nice to others */
			cond_resched();
		}
		sector += bdev_zone_sectors(bdev);
	}

	if (bio) {
		ret = submit_bio_wait(bio);
		bio_put(bio);
	}

out:
	kfree(d.zbitmap);

	return blkdev_zone_mgmt_errno_to_nvme_status(ret);
}
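
/*
 * Note (added for clarity): a "select all" Open/Close/Finish only applies to
 * zones whose current state allows that transition (e.g. Close All acts only
 * on opened zones), so the helper above first builds a bitmap of eligible
 * zones via report zones and then issues one zone-management bio per eligible
 * zone, rather than blindly operating on every zone of the device.
 */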

static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req)
{
	int ret;

	switch (zsa_req_op(req->cmd->zms.zsa)) {
	case REQ_OP_ZONE_RESET:
		ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0,
				       get_capacity(req->ns->bdev->bd_disk));
		if (ret < 0)
			return blkdev_zone_mgmt_errno_to_nvme_status(ret);
		break;
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		return nvmet_bdev_zone_mgmt_emulate_all(req);
	default:
		/* this is needed to quiet compiler warning */
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}

	return NVME_SC_SUCCESS;
}

static void nvmet_bdev_zmgmt_send_work(struct work_struct *w)
{
	struct nvmet_req *req = container_of(w, struct nvmet_req, z.zmgmt_work);
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
	enum req_op op = zsa_req_op(req->cmd->zms.zsa);
	struct block_device *bdev = req->ns->bdev;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	u16 status = NVME_SC_SUCCESS;
	int ret;

	if (op == REQ_OP_LAST) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, zsa);
		status = NVME_SC_ZONE_INVALID_TRANSITION | NVME_STATUS_DNR;
		goto out;
	}

	/* when the select all bit is set, the slba field is ignored */
	if (req->cmd->zms.select_all) {
		status = nvmet_bdev_execute_zmgmt_send_all(req);
		goto out;
	}

	if (sect >= get_capacity(bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
		status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
		goto out;
	}

	if (sect & (zone_sectors - 1)) {
		req->error_loc = offsetof(struct nvme_zone_mgmt_send_cmd, slba);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto out;
	}

	ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors);
	if (ret < 0)
		status = blkdev_zone_mgmt_errno_to_nvme_status(ret);

out:
	nvmet_req_complete(req, status);
}

void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
{
	INIT_WORK(&req->z.zmgmt_work, nvmet_bdev_zmgmt_send_work);
	queue_work(zbd_wq, &req->z.zmgmt_work);
}

static void nvmet_bdev_zone_append_bio_done(struct bio *bio)
{
	struct nvmet_req *req = bio->bi_private;

	if (bio->bi_status == BLK_STS_OK) {
		req->cqe->result.u64 =
			nvmet_sect_to_lba(req->ns, bio->bi_iter.bi_sector);
	}

	nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
	nvmet_req_bio_put(req, bio);
}

void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
{
	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
	const blk_opf_t opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
	u16 status = NVME_SC_SUCCESS;
	unsigned int total_len = 0;
	struct scatterlist *sg;
	u32 data_len = nvmet_rw_data_len(req);
	struct bio *bio;
	int sg_cnt;

	/* Request is completed on len mismatch in nvmet_check_transfer_len() */
	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
		return;

	if (data_len >
	    bdev_max_zone_append_sectors(req->ns->bdev) << SECTOR_SHIFT) {
		req->error_loc = offsetof(struct nvme_rw_command, length);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto out;
	}

	if (!req->sg_cnt) {
		nvmet_req_complete(req, 0);
		return;
	}

	if (sect >= get_capacity(req->ns->bdev->bd_disk)) {
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		status = NVME_SC_LBA_RANGE | NVME_STATUS_DNR;
		goto out;
	}

	if (sect & (bdev_zone_sectors(req->ns->bdev) - 1)) {
		req->error_loc = offsetof(struct nvme_rw_command, slba);
		status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
		goto out;
	}

	if (nvmet_use_inline_bvec(req)) {
		bio = &req->z.inline_bio;
		bio_init(bio, req->ns->bdev, req->inline_bvec,
			 ARRAY_SIZE(req->inline_bvec), opf);
	} else {
		bio = bio_alloc(req->ns->bdev, req->sg_cnt, opf, GFP_KERNEL);
	}

	bio->bi_end_io = nvmet_bdev_zone_append_bio_done;
	bio->bi_iter.bi_sector = sect;
	bio->bi_private = req;
	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
		bio->bi_opf |= REQ_FUA;

	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
		unsigned int len = sg->length;

		if (bio_add_page(bio, sg_page(sg), len, sg->offset) != len) {
			status = NVME_SC_INTERNAL;
			goto out_put_bio;
		}
		total_len += len;
	}

	if (total_len != data_len) {
		status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
		goto out_put_bio;
	}

	submit_bio(bio);
	return;

out_put_bio:
	nvmet_req_bio_put(req, bio);
out:
	nvmet_req_complete(req, status);
}
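
/*
 * Note (added for clarity): on successful completion of a REQ_OP_ZONE_APPEND
 * bio the block layer updates bi_iter.bi_sector to the sector where the data
 * actually landed; nvmet_bdev_zone_append_bio_done() above converts that to
 * an LBA and returns it to the host in the CQE result field, as required for
 * the Zone Append assigned LBA.
 */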

u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req)
{
	struct nvme_command *cmd = req->cmd;

	switch (cmd->common.opcode) {
	case nvme_cmd_zone_append:
		req->execute = nvmet_bdev_execute_zone_append;
		return 0;
	case nvme_cmd_zone_mgmt_recv:
		req->execute = nvmet_bdev_execute_zone_mgmt_recv;
		return 0;
	case nvme_cmd_zone_mgmt_send:
		req->execute = nvmet_bdev_execute_zone_mgmt_send;
		return 0;
	default:
		return nvmet_bdev_parse_io_cmd(req);
	}
}