Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block

+1 -1

Documentation/admin-guide/cgroup-v2.rst

··· 2124 2124 a queue (device) has been associated with the bio and 2125 2125 before submission. 2126 2126 2127 - wbc_account_io(@wbc, @page, @bytes) 2127 + wbc_account_cgroup_owner(@wbc, @page, @bytes) 2128 2128 Should be called for each data segment being written out. 2129 2129 While this function doesn't care exactly when it's called 2130 2130 during the writeback session, it's the easiest and most

-5

Documentation/block/biodoc.txt

··· 843 843 844 844 elevator_completed_req_fn called when a request is completed. 845 845 846 - elevator_may_queue_fn returns true if the scheduler wants to allow the 847 - current context to queue a new request even if 848 - it is over the queue limit. This must be used 849 - very carefully!! 850 - 851 846 elevator_set_req_fn 852 847 elevator_put_req_fn Must be used to allocate and free any elevator 853 848 specific storage for a request.

+13

MAINTAINERS

··· 4183 4183 F: mm/memcontrol.c 4184 4184 F: mm/swap_cgroup.c 4185 4185 4186 + CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO) 4187 + M: Tejun Heo <tj@kernel.org> 4188 + M: Jens Axboe <axboe@kernel.dk> 4189 + L: cgroups@vger.kernel.org 4190 + L: linux-block@vger.kernel.org 4191 + T: git git://git.kernel.dk/linux-block 4192 + F: Documentation/cgroup-v1/blkio-controller.rst 4193 + F: block/blk-cgroup.c 4194 + F: include/linux/blk-cgroup.h 4195 + F: block/blk-throttle.c 4196 + F: block/blk-iolatency.c 4197 + F: block/bfq-cgroup.c 4198 + 4186 4199 CORETEMP HARDWARE MONITORING DRIVER 4187 4200 M: Fenghua Yu <fenghua.yu@intel.com> 4188 4201 L: linux-hwmon@vger.kernel.org

+6 -2

block/bio-integrity.c

··· 276 276 ret = bio_integrity_add_page(bio, virt_to_page(buf), 277 277 bytes, offset); 278 278 279 - if (ret == 0) 280 - return false; 279 + if (ret == 0) { 280 + printk(KERN_ERR "could not attach integrity payload\n"); 281 + kfree(buf); 282 + status = BLK_STS_RESOURCE; 283 + goto err_end_io; 284 + } 281 285 282 286 if (ret < bytes) 283 287 break;

+27 -1

block/bio.c

··· 16 16 #include <linux/workqueue.h> 17 17 #include <linux/cgroup.h> 18 18 #include <linux/blk-cgroup.h> 19 + #include <linux/highmem.h> 19 20 20 21 #include <trace/events/block.h> 21 22 #include "blk.h" ··· 1442 1441 bio_put(bio); 1443 1442 } 1444 1443 1444 + static void bio_invalidate_vmalloc_pages(struct bio *bio) 1445 + { 1446 + #ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE 1447 + if (bio->bi_private && !op_is_write(bio_op(bio))) { 1448 + unsigned long i, len = 0; 1449 + 1450 + for (i = 0; i < bio->bi_vcnt; i++) 1451 + len += bio->bi_io_vec[i].bv_len; 1452 + invalidate_kernel_vmap_range(bio->bi_private, len); 1453 + } 1454 + #endif 1455 + } 1456 + 1445 1457 static void bio_map_kern_endio(struct bio *bio) 1446 1458 { 1459 + bio_invalidate_vmalloc_pages(bio); 1447 1460 bio_put(bio); 1448 1461 } 1449 1462 ··· 1478 1463 unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1479 1464 unsigned long start = kaddr >> PAGE_SHIFT; 1480 1465 const int nr_pages = end - start; 1466 + bool is_vmalloc = is_vmalloc_addr(data); 1467 + struct page *page; 1481 1468 int offset, i; 1482 1469 struct bio *bio; 1483 1470 1484 1471 bio = bio_kmalloc(gfp_mask, nr_pages); 1485 1472 if (!bio) 1486 1473 return ERR_PTR(-ENOMEM); 1474 + 1475 + if (is_vmalloc) { 1476 + flush_kernel_vmap_range(data, len); 1477 + bio->bi_private = data; 1478 + } 1487 1479 1488 1480 offset = offset_in_page(kaddr); 1489 1481 for (i = 0; i < nr_pages; i++) { ··· 1502 1480 if (bytes > len) 1503 1481 bytes = len; 1504 1482 1505 - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, 1483 + if (!is_vmalloc) 1484 + page = virt_to_page(data); 1485 + else 1486 + page = vmalloc_to_page(data); 1487 + if (bio_add_pc_page(q, bio, page, bytes, 1506 1488 offset) < bytes) { 1507 1489 /* we don't support partial mappings */ 1508 1490 bio_put(bio);

+61 -5

block/blk-cgroup.c

··· 29 29 #include <linux/ctype.h> 30 30 #include <linux/blk-cgroup.h> 31 31 #include <linux/tracehook.h> 32 + #include <linux/psi.h> 32 33 #include "blk.h" 33 34 34 35 #define MAX_KEY_LEN 100 ··· 48 47 EXPORT_SYMBOL_GPL(blkcg_root); 49 48 50 49 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; 50 + EXPORT_SYMBOL_GPL(blkcg_root_css); 51 51 52 52 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 53 53 54 54 static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ 55 55 56 56 static bool blkcg_debug_stats = false; 57 + static struct workqueue_struct *blkcg_punt_bio_wq; 57 58 58 59 static bool blkcg_policy_enabled(struct request_queue *q, 59 60 const struct blkcg_policy *pol) ··· 90 87 { 91 88 struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); 92 89 90 + WARN_ON(!bio_list_empty(&blkg->async_bios)); 91 + 93 92 /* release the blkcg and parent blkg refs this blkg has been holding */ 94 93 css_put(&blkg->blkcg->css); 95 94 if (blkg->parent) ··· 115 110 struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); 116 111 117 112 call_rcu(&blkg->rcu_head, __blkg_release); 113 + } 114 + 115 + static void blkg_async_bio_workfn(struct work_struct *work) 116 + { 117 + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, 118 + async_bio_work); 119 + struct bio_list bios = BIO_EMPTY_LIST; 120 + struct bio *bio; 121 + 122 + /* as long as there are pending bios, @blkg can't go away */ 123 + spin_lock_bh(&blkg->async_bio_lock); 124 + bio_list_merge(&bios, &blkg->async_bios); 125 + bio_list_init(&blkg->async_bios); 126 + spin_unlock_bh(&blkg->async_bio_lock); 127 + 128 + while ((bio = bio_list_pop(&bios))) 129 + submit_bio(bio); 118 130 } 119 131 120 132 /** ··· 162 140 163 141 blkg->q = q; 164 142 INIT_LIST_HEAD(&blkg->q_node); 143 + spin_lock_init(&blkg->async_bio_lock); 144 + bio_list_init(&blkg->async_bios); 145 + INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); 165 146 blkg->blkcg = blkcg; 166 147 167 
148 for (i = 0; i < BLKCG_MAX_POLS; i++) { ··· 1551 1526 } 1552 1527 EXPORT_SYMBOL_GPL(blkcg_policy_unregister); 1553 1528 1529 + bool __blkcg_punt_bio_submit(struct bio *bio) 1530 + { 1531 + struct blkcg_gq *blkg = bio->bi_blkg; 1532 + 1533 + /* consume the flag first */ 1534 + bio->bi_opf &= ~REQ_CGROUP_PUNT; 1535 + 1536 + /* never bounce for the root cgroup */ 1537 + if (!blkg->parent) 1538 + return false; 1539 + 1540 + spin_lock_bh(&blkg->async_bio_lock); 1541 + bio_list_add(&blkg->async_bios, bio); 1542 + spin_unlock_bh(&blkg->async_bio_lock); 1543 + 1544 + queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); 1545 + return true; 1546 + } 1547 + 1554 1548 /* 1555 1549 * Scale the accumulated delay based on how long it has been since we updated 1556 1550 * the delay. We only call this when we are adding delay, in case it's been a ··· 1631 1587 */ 1632 1588 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) 1633 1589 { 1590 + unsigned long pflags; 1634 1591 u64 now = ktime_to_ns(ktime_get()); 1635 1592 u64 exp; 1636 1593 u64 delay_nsec = 0; ··· 1658 1613 */ 1659 1614 delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); 1660 1615 1661 - /* 1662 - * TODO: the use_memdelay flag is going to be for the upcoming psi stuff 1663 - * that hasn't landed upstream yet. Once that stuff is in place we need 1664 - * to do a psi_memstall_enter/leave if memdelay is set. 
1665 - */ 1616 + if (use_memdelay) 1617 + psi_memstall_enter(&pflags); 1666 1618 1667 1619 exp = ktime_add_ns(now, delay_nsec); 1668 1620 tok = io_schedule_prepare(); ··· 1669 1627 break; 1670 1628 } while (!fatal_signal_pending(current)); 1671 1629 io_schedule_finish(tok); 1630 + 1631 + if (use_memdelay) 1632 + psi_memstall_leave(&pflags); 1672 1633 } 1673 1634 1674 1635 /** ··· 1770 1725 blkcg_scale_delay(blkg, now); 1771 1726 atomic64_add(delta, &blkg->delay_nsec); 1772 1727 } 1728 + 1729 + static int __init blkcg_init(void) 1730 + { 1731 + blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", 1732 + WQ_MEM_RECLAIM | WQ_FREEZABLE | 1733 + WQ_UNBOUND | WQ_SYSFS, 0); 1734 + if (!blkcg_punt_bio_wq) 1735 + return -ENOMEM; 1736 + return 0; 1737 + } 1738 + subsys_initcall(blkcg_init); 1773 1739 1774 1740 module_param(blkcg_debug_stats, bool, 0644); 1775 1741 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");

+5 -1

block/blk-core.c

··· 117 117 rq->internal_tag = -1; 118 118 rq->start_time_ns = ktime_get_ns(); 119 119 rq->part = NULL; 120 + refcount_set(&rq->ref, 1); 120 121 } 121 122 EXPORT_SYMBOL(blk_rq_init); 122 123 ··· 688 687 struct request *rq; 689 688 struct list_head *plug_list; 690 689 691 - plug = current->plug; 690 + plug = blk_mq_plug(q, bio); 692 691 if (!plug) 693 692 return false; 694 693 ··· 1128 1127 */ 1129 1128 blk_qc_t submit_bio(struct bio *bio) 1130 1129 { 1130 + if (blkcg_punt_bio_submit(bio)) 1131 + return BLK_QC_T_NONE; 1132 + 1131 1133 /* 1132 1134 * If it's a regular read/write or a barrier with data attached, 1133 1135 * go through the normal accounting stuff before submission.

+1 -1

block/blk-mq.c

··· 1973 1973 1974 1974 blk_mq_bio_to_request(rq, bio, nr_segs); 1975 1975 1976 - plug = current->plug; 1976 + plug = blk_mq_plug(q, bio); 1977 1977 if (unlikely(is_flush_fua)) { 1978 1978 /* bypass scheduler for flush rq */ 1979 1979 blk_insert_flush(rq);

+32

block/blk-mq.h

··· 233 233 qmap->mq_map[cpu] = 0; 234 234 } 235 235 236 + /* 237 + * blk_mq_plug() - Get caller context plug 238 + * @q: request queue 239 + * @bio : the bio being submitted by the caller context 240 + * 241 + * Plugging, by design, may delay the insertion of BIOs into the elevator in 242 + * order to increase BIO merging opportunities. This however can cause BIO 243 + * insertion order to change from the order in which submit_bio() is being 244 + * executed in the case of multiple contexts concurrently issuing BIOs to a 245 + * device, even if these context are synchronized to tightly control BIO issuing 246 + * order. While this is not a problem with regular block devices, this ordering 247 + * change can cause write BIO failures with zoned block devices as these 248 + * require sequential write patterns to zones. Prevent this from happening by 249 + * ignoring the plug state of a BIO issuing context if the target request queue 250 + * is for a zoned block device and the BIO to plug is a write operation. 251 + * 252 + * Return current->plug if the bio can be plugged and NULL otherwise 253 + */ 254 + static inline struct blk_plug *blk_mq_plug(struct request_queue *q, 255 + struct bio *bio) 256 + { 257 + /* 258 + * For regular block devices or read operations, use the context plug 259 + * which may be NULL if blk_start_plug() was not executed. 260 + */ 261 + if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio))) 262 + return current->plug; 263 + 264 + /* Zoned block device write operation case: do not plug the BIO */ 265 + return NULL; 266 + } 267 + 236 268 #endif

+3 -6

block/blk-throttle.c

··· 881 881 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 882 882 u64 tmp; 883 883 884 - jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 884 + jiffy_elapsed = jiffies - tg->slice_start[rw]; 885 885 886 - /* Slice has just started. Consider one slice interval */ 887 - if (!jiffy_elapsed) 888 - jiffy_elapsed_rnd = tg->td->throtl_slice; 889 - 890 - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); 886 + /* Round up to the next throttle slice, wait time must be nonzero */ 887 + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); 891 888 892 889 /* 893 890 * jiffy_elapsed_rnd should not be a big value as minimum iops can be

+39 -28

block/blk-zoned.c

··· 14 14 #include <linux/rbtree.h> 15 15 #include <linux/blkdev.h> 16 16 #include <linux/blk-mq.h> 17 + #include <linux/mm.h> 18 + #include <linux/vmalloc.h> 19 + #include <linux/sched/mm.h> 17 20 18 21 #include "blk.h" 19 22 ··· 73 70 static inline unsigned int __blkdev_nr_zones(struct request_queue *q, 74 71 sector_t nr_sectors) 75 72 { 76 - unsigned long zone_sectors = blk_queue_zone_sectors(q); 73 + sector_t zone_sectors = blk_queue_zone_sectors(q); 77 74 78 75 return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); 79 76 } ··· 120 117 } 121 118 122 119 static int blk_report_zones(struct gendisk *disk, sector_t sector, 123 - struct blk_zone *zones, unsigned int *nr_zones, 124 - gfp_t gfp_mask) 120 + struct blk_zone *zones, unsigned int *nr_zones) 125 121 { 126 122 struct request_queue *q = disk->queue; 127 123 unsigned int z = 0, n, nrz = *nr_zones; ··· 129 127 130 128 while (z < nrz && sector < capacity) { 131 129 n = nrz - z; 132 - ret = disk->fops->report_zones(disk, sector, &zones[z], &n, 133 - gfp_mask); 130 + ret = disk->fops->report_zones(disk, sector, &zones[z], &n); 134 131 if (ret) 135 132 return ret; 136 133 if (!n) ··· 150 149 * @sector: Sector from which to report zones 151 150 * @zones: Array of zone structures where to return the zones information 152 151 * @nr_zones: Number of zone structures in the zone array 153 - * @gfp_mask: Memory allocation flags (for bio_alloc) 154 152 * 155 153 * Description: 156 154 * Get zone information starting from the zone containing @sector. 157 155 * The number of zone information reported may be less than the number 158 156 * requested by @nr_zones. The number of zones actually reported is 159 157 * returned in @nr_zones. 158 + * The caller must use memalloc_noXX_save/restore() calls to control 159 + * memory allocations done within this function (zone array and command 160 + * buffer allocation by the device driver). 
160 161 */ 161 162 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 162 - struct blk_zone *zones, unsigned int *nr_zones, 163 - gfp_t gfp_mask) 163 + struct blk_zone *zones, unsigned int *nr_zones) 164 164 { 165 165 struct request_queue *q = bdev_get_queue(bdev); 166 166 unsigned int i, nrz; ··· 186 184 nrz = min(*nr_zones, 187 185 __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); 188 186 ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, 189 - zones, &nrz, gfp_mask); 187 + zones, &nrz); 190 188 if (ret) 191 189 return ret; 192 190 ··· 307 305 if (!zones) 308 306 return -ENOMEM; 309 307 310 - ret = blkdev_report_zones(bdev, rep.sector, 311 - zones, &rep.nr_zones, 312 - GFP_KERNEL); 308 + ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones); 313 309 if (ret) 314 310 goto out; 315 311 ··· 373 373 * Allocate an array of struct blk_zone to get nr_zones zone information. 374 374 * The allocated array may be smaller than nr_zones. 375 375 */ 376 - static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) 376 + static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones) 377 377 { 378 - size_t size = *nr_zones * sizeof(struct blk_zone); 379 - struct page *page; 380 - int order; 378 + struct blk_zone *zones; 379 + size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES); 381 380 382 - for (order = get_order(size); order >= 0; order--) { 383 - page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); 384 - if (page) { 385 - *nr_zones = min_t(unsigned int, *nr_zones, 386 - (PAGE_SIZE << order) / sizeof(struct blk_zone)); 387 - return page_address(page); 388 - } 381 + /* 382 + * GFP_KERNEL here is meaningless as the caller task context has 383 + * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones() 384 + * with memalloc_noio_save(). 
385 + */ 386 + zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL); 387 + if (!zones) { 388 + *nr_zones = 0; 389 + return NULL; 389 390 } 390 391 391 - return NULL; 392 + *nr_zones = nrz; 393 + 394 + return zones; 392 395 } 393 396 394 397 void blk_queue_free_zone_bitmaps(struct request_queue *q) ··· 418 415 unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; 419 416 unsigned int i, rep_nr_zones = 0, z = 0, nrz; 420 417 struct blk_zone *zones = NULL; 418 + unsigned int noio_flag; 421 419 sector_t sector = 0; 422 420 int ret = 0; 423 421 ··· 430 426 q->nr_zones = nr_zones; 431 427 return 0; 432 428 } 429 + 430 + /* 431 + * Ensure that all memory allocations in this context are done as 432 + * if GFP_NOIO was specified. 433 + */ 434 + noio_flag = memalloc_noio_save(); 433 435 434 436 if (!blk_queue_is_zoned(q) || !nr_zones) { 435 437 nr_zones = 0; ··· 453 443 454 444 /* Get zone information and initialize seq_zones_bitmap */ 455 445 rep_nr_zones = nr_zones; 456 - zones = blk_alloc_zones(q->node, &rep_nr_zones); 446 + zones = blk_alloc_zones(&rep_nr_zones); 457 447 if (!zones) 458 448 goto out; 459 449 460 450 while (z < nr_zones) { 461 451 nrz = min(nr_zones - z, rep_nr_zones); 462 - ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO); 452 + ret = blk_report_zones(disk, sector, zones, &nrz); 463 453 if (ret) 464 454 goto out; 465 455 if (!nrz) ··· 490 480 blk_mq_unfreeze_queue(q); 491 481 492 482 out: 493 - free_pages((unsigned long)zones, 494 - get_order(rep_nr_zones * sizeof(struct blk_zone))); 483 + memalloc_noio_restore(noio_flag); 484 + 485 + kvfree(zones); 495 486 kfree(seq_zones_wlock); 496 487 kfree(seq_zones_bitmap); 497 488

+46 -13

drivers/block/nbd.c

··· 134 134 135 135 #define NBD_MAGIC 0x68797548 136 136 137 + #define NBD_DEF_BLKSIZE 1024 138 + 137 139 static unsigned int nbds_max = 16; 138 140 static int max_part = 16; 139 141 static struct workqueue_struct *recv_workqueue; ··· 1238 1236 nbd_config_put(nbd); 1239 1237 } 1240 1238 1239 + static bool nbd_is_valid_blksize(unsigned long blksize) 1240 + { 1241 + if (!blksize || !is_power_of_2(blksize) || blksize < 512 || 1242 + blksize > PAGE_SIZE) 1243 + return false; 1244 + return true; 1245 + } 1246 + 1241 1247 /* Must be called with config_lock held */ 1242 1248 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, 1243 1249 unsigned int cmd, unsigned long arg) ··· 1261 1251 case NBD_SET_SOCK: 1262 1252 return nbd_add_socket(nbd, arg, false); 1263 1253 case NBD_SET_BLKSIZE: 1264 - if (!arg || !is_power_of_2(arg) || arg < 512 || 1265 - arg > PAGE_SIZE) 1254 + if (!arg) 1255 + arg = NBD_DEF_BLKSIZE; 1256 + if (!nbd_is_valid_blksize(arg)) 1266 1257 return -EINVAL; 1267 1258 nbd_size_set(nbd, arg, 1268 1259 div_s64(config->bytesize, arg)); ··· 1343 1332 atomic_set(&config->recv_threads, 0); 1344 1333 init_waitqueue_head(&config->recv_wq); 1345 1334 init_waitqueue_head(&config->conn_wait); 1346 - config->blksize = 1024; 1335 + config->blksize = NBD_DEF_BLKSIZE; 1347 1336 atomic_set(&config->live_connections, 0); 1348 1337 try_module_get(THIS_MODULE); 1349 1338 return config; ··· 1684 1673 [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, 1685 1674 }; 1686 1675 1676 + static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) 1677 + { 1678 + struct nbd_config *config = nbd->config; 1679 + u64 bsize = config->blksize; 1680 + u64 bytes = config->bytesize; 1681 + 1682 + if (info->attrs[NBD_ATTR_SIZE_BYTES]) 1683 + bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); 1684 + 1685 + if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { 1686 + bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); 1687 + if (!bsize) 1688 + bsize = 
NBD_DEF_BLKSIZE; 1689 + if (!nbd_is_valid_blksize(bsize)) { 1690 + printk(KERN_ERR "Invalid block size %llu\n", bsize); 1691 + return -EINVAL; 1692 + } 1693 + } 1694 + 1695 + if (bytes != config->bytesize || bsize != config->blksize) 1696 + nbd_size_set(nbd, bsize, div64_u64(bytes, bsize)); 1697 + return 0; 1698 + } 1699 + 1687 1700 static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) 1688 1701 { 1689 1702 struct nbd_device *nbd = NULL; ··· 1795 1760 refcount_set(&nbd->config_refs, 1); 1796 1761 set_bit(NBD_BOUND, &config->runtime_flags); 1797 1762 1798 - if (info->attrs[NBD_ATTR_SIZE_BYTES]) { 1799 - u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); 1800 - nbd_size_set(nbd, config->blksize, 1801 - div64_u64(bytes, config->blksize)); 1802 - } 1803 - if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { 1804 - u64 bsize = 1805 - nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); 1806 - nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize)); 1807 - } 1763 + ret = nbd_genl_size_set(info, nbd); 1764 + if (ret) 1765 + goto out; 1766 + 1808 1767 if (info->attrs[NBD_ATTR_TIMEOUT]) { 1809 1768 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); 1810 1769 nbd->tag_set.timeout = timeout * HZ; ··· 1966 1937 ret = -EINVAL; 1967 1938 goto out; 1968 1939 } 1940 + 1941 + ret = nbd_genl_size_set(info, nbd); 1942 + if (ret) 1943 + goto out; 1969 1944 1970 1945 if (info->attrs[NBD_ATTR_TIMEOUT]) { 1971 1946 u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);

+2 -3

drivers/block/null_blk.h

··· 89 89 int null_zone_init(struct nullb_device *dev); 90 90 void null_zone_exit(struct nullb_device *dev); 91 91 int null_zone_report(struct gendisk *disk, sector_t sector, 92 - struct blk_zone *zones, unsigned int *nr_zones, 93 - gfp_t gfp_mask); 92 + struct blk_zone *zones, unsigned int *nr_zones); 94 93 void null_zone_write(struct nullb_cmd *cmd, sector_t sector, 95 94 unsigned int nr_sectors); 96 95 void null_zone_reset(struct nullb_cmd *cmd, sector_t sector); ··· 102 103 static inline void null_zone_exit(struct nullb_device *dev) {} 103 104 static inline int null_zone_report(struct gendisk *disk, sector_t sector, 104 105 struct blk_zone *zones, 105 - unsigned int *nr_zones, gfp_t gfp_mask) 106 + unsigned int *nr_zones) 106 107 { 107 108 return -EOPNOTSUPP; 108 109 }

+1 -2

drivers/block/null_blk_zoned.c

··· 67 67 } 68 68 69 69 int null_zone_report(struct gendisk *disk, sector_t sector, 70 - struct blk_zone *zones, unsigned int *nr_zones, 71 - gfp_t gfp_mask) 70 + struct blk_zone *zones, unsigned int *nr_zones) 72 71 { 73 72 struct nullb *nullb = disk->private_data; 74 73 struct nullb_device *dev = nullb->dev;

+2 -3

drivers/md/dm-flakey.c

··· 461 461 462 462 #ifdef CONFIG_BLK_DEV_ZONED 463 463 static int flakey_report_zones(struct dm_target *ti, sector_t sector, 464 - struct blk_zone *zones, unsigned int *nr_zones, 465 - gfp_t gfp_mask) 464 + struct blk_zone *zones, unsigned int *nr_zones) 466 465 { 467 466 struct flakey_c *fc = ti->private; 468 467 int ret; 469 468 470 469 /* Do report and remap it */ 471 470 ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector), 472 - zones, nr_zones, gfp_mask); 471 + zones, nr_zones); 473 472 if (ret != 0) 474 473 return ret; 475 474

+2 -3

drivers/md/dm-linear.c

··· 137 137 138 138 #ifdef CONFIG_BLK_DEV_ZONED 139 139 static int linear_report_zones(struct dm_target *ti, sector_t sector, 140 - struct blk_zone *zones, unsigned int *nr_zones, 141 - gfp_t gfp_mask) 140 + struct blk_zone *zones, unsigned int *nr_zones) 142 141 { 143 142 struct linear_c *lc = (struct linear_c *) ti->private; 144 143 int ret; 145 144 146 145 /* Do report and remap it */ 147 146 ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector), 148 - zones, nr_zones, gfp_mask); 147 + zones, nr_zones); 149 148 if (ret != 0) 150 149 return ret; 151 150

+12 -4

drivers/md/dm-zoned-metadata.c

··· 8 8 9 9 #include <linux/module.h> 10 10 #include <linux/crc32.h> 11 + #include <linux/sched/mm.h> 11 12 12 13 #define DM_MSG_PREFIX "zoned metadata" 13 14 ··· 1163 1162 while (sector < dev->capacity) { 1164 1163 /* Get zone information */ 1165 1164 nr_blkz = DMZ_REPORT_NR_ZONES; 1166 - ret = blkdev_report_zones(dev->bdev, sector, blkz, 1167 - &nr_blkz, GFP_KERNEL); 1165 + ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz); 1168 1166 if (ret) { 1169 1167 dmz_dev_err(dev, "Report zones failed %d", ret); 1170 1168 goto out; ··· 1201 1201 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1202 1202 { 1203 1203 unsigned int nr_blkz = 1; 1204 + unsigned int noio_flag; 1204 1205 struct blk_zone blkz; 1205 1206 int ret; 1206 1207 1207 - /* Get zone information from disk */ 1208 + /* 1209 + * Get zone information from disk. Since blkdev_report_zones() uses 1210 + * GFP_KERNEL by default for memory allocations, set the per-task 1211 + * PF_MEMALLOC_NOIO flag so that all allocations are done as if 1212 + * GFP_NOIO was specified. 1213 + */ 1214 + noio_flag = memalloc_noio_save(); 1208 1215 ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1209 - &blkz, &nr_blkz, GFP_NOIO); 1216 + &blkz, &nr_blkz); 1217 + memalloc_noio_restore(noio_flag); 1210 1218 if (!nr_blkz) 1211 1219 ret = -EIO; 1212 1220 if (ret) {

+2 -4

drivers/md/dm.c

··· 441 441 } 442 442 443 443 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, 444 - struct blk_zone *zones, unsigned int *nr_zones, 445 - gfp_t gfp_mask) 444 + struct blk_zone *zones, unsigned int *nr_zones) 446 445 { 447 446 #ifdef CONFIG_BLK_DEV_ZONED 448 447 struct mapped_device *md = disk->private_data; ··· 479 480 * So there is no need to loop here trying to fill the entire array 480 481 * of zones. 481 482 */ 482 - ret = tgt->type->report_zones(tgt, sector, zones, 483 - nr_zones, gfp_mask); 483 + ret = tgt->type->report_zones(tgt, sector, zones, nr_zones); 484 484 485 485 out: 486 486 dm_put_live_table(md, srcu_idx);

+39 -4

drivers/nvme/host/core.c

··· 11 11 #include <linux/hdreg.h> 12 12 #include <linux/kernel.h> 13 13 #include <linux/module.h> 14 + #include <linux/backing-dev.h> 14 15 #include <linux/list_sort.h> 15 16 #include <linux/slab.h> 16 17 #include <linux/types.h> ··· 1627 1626 { 1628 1627 sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9); 1629 1628 unsigned short bs = 1 << ns->lba_shift; 1629 + u32 atomic_bs, phys_bs, io_opt; 1630 1630 1631 1631 if (ns->lba_shift > PAGE_SHIFT) { 1632 1632 /* unsupported block size, set capacity to 0 later */ ··· 1636 1634 blk_mq_freeze_queue(disk->queue); 1637 1635 blk_integrity_unregister(disk); 1638 1636 1637 + if (id->nabo == 0) { 1638 + /* 1639 + * Bit 1 indicates whether NAWUPF is defined for this namespace 1640 + * and whether it should be used instead of AWUPF. If NAWUPF == 1641 + * 0 then AWUPF must be used instead. 1642 + */ 1643 + if (id->nsfeat & (1 << 1) && id->nawupf) 1644 + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; 1645 + else 1646 + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; 1647 + } else { 1648 + atomic_bs = bs; 1649 + } 1650 + phys_bs = bs; 1651 + io_opt = bs; 1652 + if (id->nsfeat & (1 << 4)) { 1653 + /* NPWG = Namespace Preferred Write Granularity */ 1654 + phys_bs *= 1 + le16_to_cpu(id->npwg); 1655 + /* NOWS = Namespace Optimal Write Size */ 1656 + io_opt *= 1 + le16_to_cpu(id->nows); 1657 + } 1658 + 1639 1659 blk_queue_logical_block_size(disk->queue, bs); 1640 - blk_queue_physical_block_size(disk->queue, bs); 1641 - blk_queue_io_min(disk->queue, bs); 1660 + /* 1661 + * Linux filesystems assume writing a single physical block is 1662 + * an atomic operation. Hence limit the physical block size to the 1663 + * value of the Atomic Write Unit Power Fail parameter. 
1664 + */ 1665 + blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); 1666 + blk_queue_io_min(disk->queue, phys_bs); 1667 + blk_queue_io_opt(disk->queue, io_opt); 1642 1668 1643 1669 if (ns->ms && !ns->ext && 1644 1670 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) ··· 2416 2386 lockdep_assert_held(&nvme_subsystems_lock); 2417 2387 2418 2388 list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { 2419 - if (ctrl->state == NVME_CTRL_DELETING || 2420 - ctrl->state == NVME_CTRL_DEAD) 2389 + if (tmp->state == NVME_CTRL_DELETING || 2390 + tmp->state == NVME_CTRL_DEAD) 2421 2391 continue; 2422 2392 2423 2393 if (tmp->cntlid == ctrl->cntlid) { ··· 2463 2433 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); 2464 2434 subsys->vendor_id = le16_to_cpu(id->vid); 2465 2435 subsys->cmic = id->cmic; 2436 + subsys->awupf = le16_to_cpu(id->awupf); 2466 2437 #ifdef CONFIG_NVME_MULTIPATH 2467 2438 subsys->iopolicy = NVME_IOPOLICY_NUMA; 2468 2439 #endif ··· 3304 3273 ret = PTR_ERR(ns->queue); 3305 3274 goto out_free_ns; 3306 3275 } 3276 + 3277 + if (ctrl->opts && ctrl->opts->data_digest) 3278 + ns->queue->backing_dev_info->capabilities 3279 + |= BDI_CAP_STABLE_WRITES; 3307 3280 3308 3281 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 3309 3282 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)

+48 -3

drivers/nvme/host/fc.c

··· 204 204 205 205 static struct workqueue_struct *nvme_fc_wq; 206 206 207 + static bool nvme_fc_waiting_to_unload; 208 + static DECLARE_COMPLETION(nvme_fc_unload_proceed); 209 + 207 210 /* 208 211 * These items are short-term. They will eventually be moved into 209 212 * a generic FC class. See comments in module init. ··· 232 229 /* remove from transport list */ 233 230 spin_lock_irqsave(&nvme_fc_lock, flags); 234 231 list_del(&lport->port_list); 232 + if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list)) 233 + complete(&nvme_fc_unload_proceed); 235 234 spin_unlock_irqrestore(&nvme_fc_lock, flags); 236 235 237 236 ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); ··· 3462 3457 return ret; 3463 3458 } 3464 3459 3460 + static void 3461 + nvme_fc_delete_controllers(struct nvme_fc_rport *rport) 3462 + { 3463 + struct nvme_fc_ctrl *ctrl; 3464 + 3465 + spin_lock(&rport->lock); 3466 + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { 3467 + dev_warn(ctrl->ctrl.device, 3468 + "NVME-FC{%d}: transport unloading: deleting ctrl\n", 3469 + ctrl->cnum); 3470 + nvme_delete_ctrl(&ctrl->ctrl); 3471 + } 3472 + spin_unlock(&rport->lock); 3473 + } 3474 + 3475 + static void 3476 + nvme_fc_cleanup_for_unload(void) 3477 + { 3478 + struct nvme_fc_lport *lport; 3479 + struct nvme_fc_rport *rport; 3480 + 3481 + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { 3482 + list_for_each_entry(rport, &lport->endp_list, endp_list) { 3483 + nvme_fc_delete_controllers(rport); 3484 + } 3485 + } 3486 + } 3487 + 3465 3488 static void __exit nvme_fc_exit_module(void) 3466 3489 { 3467 - /* sanity check - all lports should be removed */ 3468 - if (!list_empty(&nvme_fc_lport_list)) 3469 - pr_warn("%s: localport list not empty\n", __func__); 3490 + unsigned long flags; 3491 + bool need_cleanup = false; 3492 + 3493 + spin_lock_irqsave(&nvme_fc_lock, flags); 3494 + nvme_fc_waiting_to_unload = true; 3495 + if (!list_empty(&nvme_fc_lport_list)) { 3496 + 
need_cleanup = true; 3497 + nvme_fc_cleanup_for_unload(); 3498 + } 3499 + spin_unlock_irqrestore(&nvme_fc_lock, flags); 3500 + if (need_cleanup) { 3501 + pr_info("%s: waiting for ctlr deletes\n", __func__); 3502 + wait_for_completion(&nvme_fc_unload_proceed); 3503 + pr_info("%s: ctrl deletes complete\n", __func__); 3504 + } 3470 3505 3471 3506 nvmf_unregister_transport(&nvme_fc_transport); 3472 3507

+13 -5

drivers/nvme/host/multipath.c

··· 123 123 } 124 124 } 125 125 126 + static bool nvme_path_is_disabled(struct nvme_ns *ns) 127 + { 128 + return ns->ctrl->state != NVME_CTRL_LIVE || 129 + test_bit(NVME_NS_ANA_PENDING, &ns->flags) || 130 + test_bit(NVME_NS_REMOVING, &ns->flags); 131 + } 132 + 126 133 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) 127 134 { 128 135 int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; 129 136 struct nvme_ns *found = NULL, *fallback = NULL, *ns; 130 137 131 138 list_for_each_entry_rcu(ns, &head->list, siblings) { 132 - if (ns->ctrl->state != NVME_CTRL_LIVE || 133 - test_bit(NVME_NS_ANA_PENDING, &ns->flags)) 139 + if (nvme_path_is_disabled(ns)) 134 140 continue; 135 141 136 142 if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) ··· 184 178 { 185 179 struct nvme_ns *ns, *found, *fallback = NULL; 186 180 187 - if (list_is_singular(&head->list)) 181 + if (list_is_singular(&head->list)) { 182 + if (nvme_path_is_disabled(old)) 183 + return NULL; 188 184 return old; 185 + } 189 186 190 187 for (ns = nvme_next_ns(head, old); 191 188 ns != old; 192 189 ns = nvme_next_ns(head, ns)) { 193 - if (ns->ctrl->state != NVME_CTRL_LIVE || 194 - test_bit(NVME_NS_ANA_PENDING, &ns->flags)) 190 + if (nvme_path_is_disabled(ns)) 195 191 continue; 196 192 197 193 if (ns->ana_state == NVME_ANA_OPTIMIZED) {

+1

drivers/nvme/host/nvme.h

··· 283 283 char firmware_rev[8]; 284 284 u8 cmic; 285 285 u16 vendor_id; 286 + u16 awupf; /* 0's based awupf value. */ 286 287 struct ida ns_ida; 287 288 #ifdef CONFIG_NVME_MULTIPATH 288 289 enum nvme_iopolicy iopolicy;

+16 -10

drivers/nvme/host/pci.c

··· 1439 1439 1440 1440 if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { 1441 1441 nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); 1442 - nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, 1443 - nvmeq->sq_cmds); 1444 - if (nvmeq->sq_dma_addr) { 1445 - set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); 1446 - return 0; 1442 + if (nvmeq->sq_cmds) { 1443 + nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, 1444 + nvmeq->sq_cmds); 1445 + if (nvmeq->sq_dma_addr) { 1446 + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); 1447 + return 0; 1448 + } 1449 + 1450 + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); 1447 1451 } 1448 1452 } 1449 1453 ··· 2254 2250 if (!dev->ctrl.tagset) { 2255 2251 dev->tagset.ops = &nvme_mq_ops; 2256 2252 dev->tagset.nr_hw_queues = dev->online_queues - 1; 2257 - dev->tagset.nr_maps = 2; /* default + read */ 2253 + dev->tagset.nr_maps = 1; /* default */ 2254 + if (dev->io_queues[HCTX_TYPE_READ]) 2255 + dev->tagset.nr_maps++; 2258 2256 if (dev->io_queues[HCTX_TYPE_POLL]) 2259 2257 dev->tagset.nr_maps++; 2260 2258 dev->tagset.timeout = NVME_IO_TIMEOUT; ··· 2295 2289 2296 2290 pci_set_master(pdev); 2297 2291 2298 - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && 2299 - dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) 2292 + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) 2300 2293 goto disable; 2301 2294 2302 2295 if (readl(dev->bar + NVME_REG_CSTS) == -1) { ··· 2503 2498 * Limit the max command size to prevent iod->sg allocations going 2504 2499 * over a single page. 
2505 2500 */ 2506 - dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; 2501 + dev->ctrl.max_hw_sectors = min_t(u32, 2502 + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); 2507 2503 dev->ctrl.max_segments = NVME_MAX_SEGS; 2508 2504 2509 2505 /* ··· 2929 2923 return 0; 2930 2924 } 2931 2925 2932 - const struct dev_pm_ops nvme_dev_pm_ops = { 2926 + static const struct dev_pm_ops nvme_dev_pm_ops = { 2933 2927 .suspend = nvme_suspend, 2934 2928 .resume = nvme_resume, 2935 2929 .freeze = nvme_simple_suspend,

+8 -1

drivers/nvme/host/tcp.c

··· 860 860 else 861 861 flags |= MSG_MORE; 862 862 863 - ret = kernel_sendpage(queue->sock, page, offset, len, flags); 863 + /* can't zcopy slab pages */ 864 + if (unlikely(PageSlab(page))) { 865 + ret = sock_no_sendpage(queue->sock, page, offset, len, 866 + flags); 867 + } else { 868 + ret = kernel_sendpage(queue->sock, page, offset, len, 869 + flags); 870 + } 864 871 if (ret <= 0) 865 872 return ret; 866 873

+27 -1

drivers/nvme/host/trace.c

··· 7 7 #include <asm/unaligned.h> 8 8 #include "trace.h" 9 9 10 + static const char *nvme_trace_delete_sq(struct trace_seq *p, u8 *cdw10) 11 + { 12 + const char *ret = trace_seq_buffer_ptr(p); 13 + u16 sqid = get_unaligned_le16(cdw10); 14 + 15 + trace_seq_printf(p, "sqid=%u", sqid); 16 + trace_seq_putc(p, 0); 17 + 18 + return ret; 19 + } 20 + 10 21 static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) 11 22 { 12 23 const char *ret = trace_seq_buffer_ptr(p); ··· 29 18 30 19 trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u", 31 20 sqid, qsize, sq_flags, cqid); 21 + trace_seq_putc(p, 0); 22 + 23 + return ret; 24 + } 25 + 26 + static const char *nvme_trace_delete_cq(struct trace_seq *p, u8 *cdw10) 27 + { 28 + const char *ret = trace_seq_buffer_ptr(p); 29 + u16 cqid = get_unaligned_le16(cdw10); 30 + 31 + trace_seq_printf(p, "cqid=%u", cqid); 32 32 trace_seq_putc(p, 0); 33 33 34 34 return ret; ··· 129 107 u8 opcode, u8 *cdw10) 130 108 { 131 109 switch (opcode) { 110 + case nvme_admin_delete_sq: 111 + return nvme_trace_delete_sq(p, cdw10); 132 112 case nvme_admin_create_sq: 133 113 return nvme_trace_create_sq(p, cdw10); 114 + case nvme_admin_delete_cq: 115 + return nvme_trace_delete_cq(p, cdw10); 134 116 case nvme_admin_create_cq: 135 117 return nvme_trace_create_cq(p, cdw10); 136 118 case nvme_admin_identify: ··· 204 178 { 205 179 const char *ret = trace_seq_buffer_ptr(p); 206 180 207 - trace_seq_printf(p, "spcecific=%*ph", 24, spc); 181 + trace_seq_printf(p, "specific=%*ph", 24, spc); 208 182 trace_seq_putc(p, 0); 209 183 return ret; 210 184 }

+3

drivers/nvme/target/admin-cmd.c

··· 442 442 break; 443 443 } 444 444 445 + if (ns->bdev) 446 + nvmet_bdev_set_limits(ns->bdev, id); 447 + 445 448 /* 446 449 * We just provide a single LBA format that matches what the 447 450 * underlying device reports.

+3 -1

drivers/nvme/target/configfs.c

··· 588 588 goto out; 589 589 590 590 ret = -EINVAL; 591 - if (nsid == 0 || nsid == NVME_NSID_ALL) 591 + if (nsid == 0 || nsid == NVME_NSID_ALL) { 592 + pr_err("invalid nsid %#x", nsid); 592 593 goto out; 594 + } 593 595 594 596 ret = -ENOMEM; 595 597 ns = nvmet_ns_alloc(subsys, nsid);

+22 -22

drivers/nvme/target/fcloop.c

··· 434 434 int ret = 0; 435 435 bool aborted = false; 436 436 437 - spin_lock(&tfcp_req->reqlock); 437 + spin_lock_irq(&tfcp_req->reqlock); 438 438 switch (tfcp_req->inistate) { 439 439 case INI_IO_START: 440 440 tfcp_req->inistate = INI_IO_ACTIVE; ··· 443 443 aborted = true; 444 444 break; 445 445 default: 446 - spin_unlock(&tfcp_req->reqlock); 446 + spin_unlock_irq(&tfcp_req->reqlock); 447 447 WARN_ON(1); 448 448 return; 449 449 } 450 - spin_unlock(&tfcp_req->reqlock); 450 + spin_unlock_irq(&tfcp_req->reqlock); 451 451 452 452 if (unlikely(aborted)) 453 453 ret = -ECANCELED; ··· 469 469 struct nvmefc_fcp_req *fcpreq; 470 470 bool completed = false; 471 471 472 - spin_lock(&tfcp_req->reqlock); 472 + spin_lock_irq(&tfcp_req->reqlock); 473 473 fcpreq = tfcp_req->fcpreq; 474 474 switch (tfcp_req->inistate) { 475 475 case INI_IO_ABORTED: ··· 478 478 completed = true; 479 479 break; 480 480 default: 481 - spin_unlock(&tfcp_req->reqlock); 481 + spin_unlock_irq(&tfcp_req->reqlock); 482 482 WARN_ON(1); 483 483 return; 484 484 } 485 - spin_unlock(&tfcp_req->reqlock); 485 + spin_unlock_irq(&tfcp_req->reqlock); 486 486 487 487 if (unlikely(completed)) { 488 488 /* remove reference taken in original abort downcall */ ··· 494 494 nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, 495 495 &tfcp_req->tgt_fcp_req); 496 496 497 - spin_lock(&tfcp_req->reqlock); 497 + spin_lock_irq(&tfcp_req->reqlock); 498 498 tfcp_req->fcpreq = NULL; 499 - spin_unlock(&tfcp_req->reqlock); 499 + spin_unlock_irq(&tfcp_req->reqlock); 500 500 501 501 fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); 502 502 /* call_host_done releases reference for abort downcall */ ··· 513 513 container_of(work, struct fcloop_fcpreq, tio_done_work); 514 514 struct nvmefc_fcp_req *fcpreq; 515 515 516 - spin_lock(&tfcp_req->reqlock); 516 + spin_lock_irq(&tfcp_req->reqlock); 517 517 fcpreq = tfcp_req->fcpreq; 518 518 tfcp_req->inistate = INI_IO_COMPLETED; 519 - spin_unlock(&tfcp_req->reqlock); 519 + 
spin_unlock_irq(&tfcp_req->reqlock); 520 520 521 521 fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); 522 522 } ··· 535 535 if (!rport->targetport) 536 536 return -ECONNREFUSED; 537 537 538 - tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL); 538 + tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_ATOMIC); 539 539 if (!tfcp_req) 540 540 return -ENOMEM; 541 541 ··· 621 621 int fcp_err = 0, active, aborted; 622 622 u8 op = tgt_fcpreq->op; 623 623 624 - spin_lock(&tfcp_req->reqlock); 624 + spin_lock_irq(&tfcp_req->reqlock); 625 625 fcpreq = tfcp_req->fcpreq; 626 626 active = tfcp_req->active; 627 627 aborted = tfcp_req->aborted; 628 628 tfcp_req->active = true; 629 - spin_unlock(&tfcp_req->reqlock); 629 + spin_unlock_irq(&tfcp_req->reqlock); 630 630 631 631 if (unlikely(active)) 632 632 /* illegal - call while i/o active */ ··· 634 634 635 635 if (unlikely(aborted)) { 636 636 /* target transport has aborted i/o prior */ 637 - spin_lock(&tfcp_req->reqlock); 637 + spin_lock_irq(&tfcp_req->reqlock); 638 638 tfcp_req->active = false; 639 - spin_unlock(&tfcp_req->reqlock); 639 + spin_unlock_irq(&tfcp_req->reqlock); 640 640 tgt_fcpreq->transferred_length = 0; 641 641 tgt_fcpreq->fcp_error = -ECANCELED; 642 642 tgt_fcpreq->done(tgt_fcpreq); ··· 693 693 break; 694 694 } 695 695 696 - spin_lock(&tfcp_req->reqlock); 696 + spin_lock_irq(&tfcp_req->reqlock); 697 697 tfcp_req->active = false; 698 - spin_unlock(&tfcp_req->reqlock); 698 + spin_unlock_irq(&tfcp_req->reqlock); 699 699 700 700 tgt_fcpreq->transferred_length = xfrlen; 701 701 tgt_fcpreq->fcp_error = fcp_err; ··· 715 715 * (one doing io, other doing abort) and only kills ops posted 716 716 * after the abort request 717 717 */ 718 - spin_lock(&tfcp_req->reqlock); 718 + spin_lock_irq(&tfcp_req->reqlock); 719 719 tfcp_req->aborted = true; 720 - spin_unlock(&tfcp_req->reqlock); 720 + spin_unlock_irq(&tfcp_req->reqlock); 721 721 722 722 tfcp_req->status = NVME_SC_INTERNAL; 723 723 ··· 765 765 return; 766 766 767 767 /* 
break initiator/target relationship for io */ 768 - spin_lock(&tfcp_req->reqlock); 768 + spin_lock_irq(&tfcp_req->reqlock); 769 769 switch (tfcp_req->inistate) { 770 770 case INI_IO_START: 771 771 case INI_IO_ACTIVE: ··· 775 775 abortio = false; 776 776 break; 777 777 default: 778 - spin_unlock(&tfcp_req->reqlock); 778 + spin_unlock_irq(&tfcp_req->reqlock); 779 779 WARN_ON(1); 780 780 return; 781 781 } 782 - spin_unlock(&tfcp_req->reqlock); 782 + spin_unlock_irq(&tfcp_req->reqlock); 783 783 784 784 if (abortio) 785 785 /* leave the reference while the work item is scheduled */

+39

drivers/nvme/target/io-cmd-bdev.c

··· 8 8 #include <linux/module.h> 9 9 #include "nvmet.h" 10 10 11 + void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) 12 + { 13 + const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; 14 + /* Number of physical blocks per logical block. */ 15 + const u32 ppl = ql->physical_block_size / ql->logical_block_size; 16 + /* Physical blocks per logical block, 0's based. */ 17 + const __le16 ppl0b = to0based(ppl); 18 + 19 + /* 20 + * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, 21 + * NAWUPF, and NACWU are defined for this namespace and should be 22 + * used by the host for this namespace instead of the AWUN, AWUPF, 23 + * and ACWU fields in the Identify Controller data structure. If 24 + * any of these fields are zero that means that the corresponding 25 + * field from the identify controller data structure should be used. 26 + */ 27 + id->nsfeat |= 1 << 1; 28 + id->nawun = ppl0b; 29 + id->nawupf = ppl0b; 30 + id->nacwu = ppl0b; 31 + 32 + /* 33 + * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and 34 + * NOWS are defined for this namespace and should be used by 35 + * the host for I/O optimization. 36 + */ 37 + id->nsfeat |= 1 << 4; 38 + /* NPWG = Namespace Preferred Write Granularity. 0's based */ 39 + id->npwg = ppl0b; 40 + /* NPWA = Namespace Preferred Write Alignment. 0's based */ 41 + id->npwa = id->npwg; 42 + /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ 43 + id->npdg = to0based(ql->discard_granularity / ql->logical_block_size); 44 + /* NPDA = Namespace Preferred Deallocate Alignment */ 45 + id->npda = id->npdg; 46 + /* NOWS = Namespace Optimal Write Size */ 47 + id->nows = to0based(ql->io_opt / ql->logical_block_size); 48 + } 49 + 11 50 int nvmet_bdev_ns_enable(struct nvmet_ns *ns) 12 51 { 13 52 int ret;

+8

drivers/nvme/target/nvmet.h

··· 365 365 void nvmet_execute_async_event(struct nvmet_req *req); 366 366 367 367 u16 nvmet_parse_connect_cmd(struct nvmet_req *req); 368 + void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); 368 369 u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); 369 370 u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); 370 371 u16 nvmet_parse_admin_cmd(struct nvmet_req *req); ··· 493 492 } 494 493 495 494 u16 errno_to_nvme_status(struct nvmet_req *req, int errno); 495 + 496 + /* Convert a 32-bit number to a 16-bit 0's based number */ 497 + static inline __le16 to0based(u32 a) 498 + { 499 + return cpu_to_le16(max(1U, min(1U << 16, a)) - 1); 500 + } 501 + 496 502 #endif /* _NVMET_H */

+1 -1

drivers/nvme/target/trace.c

··· 146 146 { 147 147 const char *ret = trace_seq_buffer_ptr(p); 148 148 149 - trace_seq_printf(p, "spcecific=%*ph", 24, spc); 149 + trace_seq_printf(p, "specific=%*ph", 24, spc); 150 150 trace_seq_putc(p, 0); 151 151 return ret; 152 152 }

+1 -2

drivers/scsi/sd.h

··· 213 213 extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, 214 214 struct scsi_sense_hdr *sshdr); 215 215 extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, 216 - struct blk_zone *zones, unsigned int *nr_zones, 217 - gfp_t gfp_mask); 216 + struct blk_zone *zones, unsigned int *nr_zones); 218 217 219 218 #else /* CONFIG_BLK_DEV_ZONED */ 220 219

+76 -32

drivers/scsi/sd_zbc.c

··· 9 9 */ 10 10 11 11 #include <linux/blkdev.h> 12 + #include <linux/vmalloc.h> 13 + #include <linux/sched/mm.h> 12 14 13 15 #include <asm/unaligned.h> 14 16 ··· 52 50 /** 53 51 * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command. 54 52 * @sdkp: The target disk 55 - * @buf: Buffer to use for the reply 53 + * @buf: vmalloc-ed buffer to use for the reply 56 54 * @buflen: the buffer size 57 55 * @lba: Start LBA of the report 58 56 * @partial: Do partial report ··· 81 79 put_unaligned_be32(buflen, &cmd[10]); 82 80 if (partial) 83 81 cmd[14] = ZBC_REPORT_ZONE_PARTIAL; 84 - memset(buf, 0, buflen); 85 82 86 83 result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, 87 84 buf, buflen, &sshdr, ··· 104 103 return 0; 105 104 } 106 105 106 + /* 107 + * Maximum number of zones to get with one report zones command. 108 + */ 109 + #define SD_ZBC_REPORT_MAX_ZONES 8192U 110 + 111 + /** 112 + * Allocate a buffer for report zones reply. 113 + * @sdkp: The target disk 114 + * @nr_zones: Maximum number of zones to report 115 + * @buflen: Size of the buffer allocated 116 + * 117 + * Try to allocate a reply buffer for the number of requested zones. 118 + * The size of the buffer allocated may be smaller than requested to 119 + * satisfy the device constraints (max_hw_sectors, max_segments, etc). 120 + * 121 + * Return the address of the allocated buffer and update @buflen with 122 + * the size of the allocated buffer. 123 + */ 124 + static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, 125 + unsigned int nr_zones, size_t *buflen) 126 + { 127 + struct request_queue *q = sdkp->disk->queue; 128 + size_t bufsize; 129 + void *buf; 130 + 131 + /* 132 + * Report zone buffer size should be at most 64B times the number of 133 + * zones requested plus the 64B reply header, but should be at least 134 + * SECTOR_SIZE for ATA devices. 135 + * Make sure that this size does not exceed the hardware capabilities. 
136 + * Furthermore, since the report zone command cannot be split, make 137 + * sure that the allocated buffer can always be mapped by limiting the 138 + * number of pages allocated to the HBA max segments limit. 139 + */ 140 + nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES); 141 + bufsize = roundup((nr_zones + 1) * 64, 512); 142 + bufsize = min_t(size_t, bufsize, 143 + queue_max_hw_sectors(q) << SECTOR_SHIFT); 144 + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); 145 + 146 + buf = vzalloc(bufsize); 147 + if (buf) 148 + *buflen = bufsize; 149 + 150 + return buf; 151 + } 152 + 107 153 /** 108 154 * sd_zbc_report_zones - Disk report zones operation. 109 155 * @disk: The target disk 110 156 * @sector: Start 512B sector of the report 111 157 * @zones: Array of zone descriptors 112 158 * @nr_zones: Number of descriptors in the array 113 - * @gfp_mask: Memory allocation mask 114 159 * 115 160 * Execute a report zones command on the target disk. 116 161 */ 117 162 int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, 118 - struct blk_zone *zones, unsigned int *nr_zones, 119 - gfp_t gfp_mask) 163 + struct blk_zone *zones, unsigned int *nr_zones) 120 164 { 121 165 struct scsi_disk *sdkp = scsi_disk(disk); 122 - unsigned int i, buflen, nrz = *nr_zones; 166 + unsigned int i, nrz = *nr_zones; 123 167 unsigned char *buf; 124 - size_t offset = 0; 168 + size_t buflen = 0, offset = 0; 125 169 int ret = 0; 126 170 127 171 if (!sd_is_zoned(sdkp)) 128 172 /* Not a zoned device */ 129 173 return -EOPNOTSUPP; 130 174 131 - /* 132 - * Get a reply buffer for the number of requested zones plus a header, 133 - * without exceeding the device maximum command size. For ATA disks, 134 - * buffers must be aligned to 512B. 
135 - */ 136 - buflen = min(queue_max_hw_sectors(disk->queue) << 9, 137 - roundup((nrz + 1) * 64, 512)); 138 - buf = kmalloc(buflen, gfp_mask); 175 + buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen); 139 176 if (!buf) 140 177 return -ENOMEM; 141 178 142 179 ret = sd_zbc_do_report_zones(sdkp, buf, buflen, 143 180 sectors_to_logical(sdkp->device, sector), true); 144 181 if (ret) 145 - goto out_free_buf; 182 + goto out; 146 183 147 184 nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64); 148 185 for (i = 0; i < nrz; i++) { ··· 191 152 192 153 *nr_zones = nrz; 193 154 194 - out_free_buf: 195 - kfree(buf); 155 + out: 156 + kvfree(buf); 196 157 197 158 return ret; 198 159 } ··· 326 287 return 0; 327 288 } 328 289 329 - #define SD_ZBC_BUF_SIZE 131072U 330 - 331 290 /** 332 291 * sd_zbc_check_zones - Check the device capacity and zone sizes 333 292 * @sdkp: Target disk ··· 341 304 */ 342 305 static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) 343 306 { 307 + size_t bufsize, buflen; 308 + unsigned int noio_flag; 344 309 u64 zone_blocks = 0; 345 310 sector_t max_lba, block = 0; 346 311 unsigned char *buf; 347 312 unsigned char *rec; 348 - unsigned int buf_len; 349 - unsigned int list_length; 350 313 int ret; 351 314 u8 same; 352 315 316 + /* Do all memory allocations as if GFP_NOIO was specified */ 317 + noio_flag = memalloc_noio_save(); 318 + 353 319 /* Get a buffer */ 354 - buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL); 355 - if (!buf) 356 - return -ENOMEM; 320 + buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES, 321 + &bufsize); 322 + if (!buf) { 323 + ret = -ENOMEM; 324 + goto out; 325 + } 357 326 358 327 /* Do a report zone to get max_lba and the same field */ 359 - ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false); 328 + ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false); 360 329 if (ret) 361 330 goto out_free; 362 331 ··· 398 355 do { 399 356 400 357 /* Parse REPORT ZONES header */ 401 - list_length = 
get_unaligned_be32(&buf[0]) + 64; 358 + buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64, 359 + bufsize); 402 360 rec = buf + 64; 403 - buf_len = min(list_length, SD_ZBC_BUF_SIZE); 404 361 405 362 /* Parse zone descriptors */ 406 - while (rec < buf + buf_len) { 363 + while (rec < buf + buflen) { 407 364 u64 this_zone_blocks = get_unaligned_be64(&rec[8]); 408 365 409 366 if (zone_blocks == 0) { ··· 419 376 } 420 377 421 378 if (block < sdkp->capacity) { 422 - ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 423 - block, true); 379 + ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block, 380 + true); 424 381 if (ret) 425 382 goto out_free; 426 383 } ··· 451 408 } 452 409 453 410 out_free: 454 - kfree(buf); 411 + memalloc_noio_restore(noio_flag); 412 + kvfree(buf); 455 413 456 414 return ret; 457 415 }

+2 -2

fs/btrfs/extent_io.c

··· 2911 2911 bio = NULL; 2912 2912 } else { 2913 2913 if (wbc) 2914 - wbc_account_io(wbc, page, page_size); 2914 + wbc_account_cgroup_owner(wbc, page, page_size); 2915 2915 return 0; 2916 2916 } 2917 2917 } ··· 2924 2924 bio->bi_opf = opf; 2925 2925 if (wbc) { 2926 2926 wbc_init_bio(wbc, bio); 2927 - wbc_account_io(wbc, page, page_size); 2927 + wbc_account_cgroup_owner(wbc, page, page_size); 2928 2928 } 2929 2929 2930 2930 *bio_ret = bio;

+1 -1

fs/buffer.c

··· 3089 3089 3090 3090 if (wbc) { 3091 3091 wbc_init_bio(wbc, bio); 3092 - wbc_account_io(wbc, bh->b_page, bh->b_size); 3092 + wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); 3093 3093 } 3094 3094 3095 3095 submit_bio(bio);

+1 -1

fs/ext4/page-io.c

··· 396 396 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); 397 397 if (ret != bh->b_size) 398 398 goto submit_and_retry; 399 - wbc_account_io(io->io_wbc, page, bh->b_size); 399 + wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); 400 400 io->io_next_block++; 401 401 return 0; 402 402 }

+3 -3

fs/f2fs/data.c

··· 470 470 } 471 471 472 472 if (fio->io_wbc && !is_read_io(fio->op)) 473 - wbc_account_io(fio->io_wbc, page, PAGE_SIZE); 473 + wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); 474 474 475 475 bio_set_op_attrs(bio, fio->op, fio->op_flags); 476 476 ··· 513 513 } 514 514 515 515 if (fio->io_wbc) 516 - wbc_account_io(fio->io_wbc, page, PAGE_SIZE); 516 + wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); 517 517 518 518 inc_page_count(fio->sbi, WB_DATA_TYPE(page)); 519 519 ··· 592 592 } 593 593 594 594 if (fio->io_wbc) 595 - wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); 595 + wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); 596 596 597 597 io->last_block_in_bio = fio->new_blkaddr; 598 598 f2fs_trace_ios(fio, 0);

+1 -3

fs/f2fs/super.c

··· 2818 2818 while (zones && sector < nr_sectors) { 2819 2819 2820 2820 nr_zones = F2FS_REPORT_NR_ZONES; 2821 - err = blkdev_report_zones(bdev, sector, 2822 - zones, &nr_zones, 2823 - GFP_KERNEL); 2821 + err = blkdev_report_zones(bdev, sector, zones, &nr_zones); 2824 2822 if (err) 2825 2823 break; 2826 2824 if (!nr_zones) {

+8 -5

fs/fs-writeback.c

··· 270 270 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) 271 271 wb_put(wb); 272 272 } 273 + EXPORT_SYMBOL_GPL(__inode_attach_wb); 273 274 274 275 /** 275 276 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it ··· 583 582 if (unlikely(wb_dying(wbc->wb))) 584 583 inode_switch_wbs(inode, wbc->wb_id); 585 584 } 585 + EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); 586 586 587 587 /** 588 588 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection ··· 703 701 wb_put(wbc->wb); 704 702 wbc->wb = NULL; 705 703 } 704 + EXPORT_SYMBOL_GPL(wbc_detach_inode); 706 705 707 706 /** 708 - * wbc_account_io - account IO issued during writeback 707 + * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership 709 708 * @wbc: writeback_control of the writeback in progress 710 709 * @page: page being written out 711 710 * @bytes: number of bytes being written out ··· 715 712 * controlled by @wbc. Keep the book for foreign inode detection. See 716 713 * wbc_detach_inode(). 717 714 */ 718 - void wbc_account_io(struct writeback_control *wbc, struct page *page, 719 - size_t bytes) 715 + void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, 716 + size_t bytes) 720 717 { 721 718 struct cgroup_subsys_state *css; 722 719 int id; ··· 727 724 * behind a slow cgroup. Ultimately, we want pageout() to kick off 728 725 * regular writeback instead of writing things out itself. 729 726 */ 730 - if (!wbc->wb) 727 + if (!wbc->wb || wbc->no_cgroup_owner) 731 728 return; 732 729 733 730 css = mem_cgroup_css_from_page(page); ··· 753 750 else 754 751 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); 755 752 } 756 - EXPORT_SYMBOL_GPL(wbc_account_io); 753 + EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); 757 754 758 755 /** 759 756 * inode_congested - test whether an inode is congested

+1 -1

fs/mpage.c

··· 647 647 * the confused fail path above (OOM) will be very confused when 648 648 * it finds all bh marked clean (i.e. it will not write anything) 649 649 */ 650 - wbc_account_io(wbc, page, PAGE_SIZE); 650 + wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); 651 651 length = first_unmapped << blkbits; 652 652 if (bio_add_page(bio, page, length, 0) < length) { 653 653 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);

+1 -1

fs/xfs/xfs_aops.c

··· 796 796 } 797 797 798 798 wpc->ioend->io_size += len; 799 - wbc_account_io(wbc, page, len); 799 + wbc_account_cgroup_owner(wbc, page, len); 800 800 } 801 801 802 802 STATIC void

+1

include/linux/backing-dev.h

··· 48 48 extern struct list_head bdi_list; 49 49 50 50 extern struct workqueue_struct *bdi_wq; 51 + extern struct workqueue_struct *bdi_async_bio_wq; 51 52 52 53 static inline bool wb_has_dirty_io(struct bdi_writeback *wb) 53 54 {

+15 -1

include/linux/blk-cgroup.h

··· 132 132 133 133 struct blkg_policy_data *pd[BLKCG_MAX_POLS]; 134 134 135 - struct rcu_head rcu_head; 135 + spinlock_t async_bio_lock; 136 + struct bio_list async_bios; 137 + struct work_struct async_bio_work; 136 138 137 139 atomic_t use_delay; 138 140 atomic64_t delay_nsec; 139 141 atomic64_t delay_start; 140 142 u64 last_delay; 141 143 int last_use; 144 + 145 + struct rcu_head rcu_head; 142 146 }; 143 147 144 148 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); ··· 705 701 struct bio *bio) { return false; } 706 702 #endif 707 703 704 + bool __blkcg_punt_bio_submit(struct bio *bio); 705 + 706 + static inline bool blkcg_punt_bio_submit(struct bio *bio) 707 + { 708 + if (bio->bi_opf & REQ_CGROUP_PUNT) 709 + return __blkcg_punt_bio_submit(bio); 710 + else 711 + return false; 712 + } 708 713 709 714 static inline void blkcg_bio_issue_init(struct bio *bio) 710 715 { ··· 861 848 static inline void blkg_get(struct blkcg_gq *blkg) { } 862 849 static inline void blkg_put(struct blkcg_gq *blkg) { } 863 850 851 + static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } 864 852 static inline void blkcg_bio_issue_init(struct bio *bio) { } 865 853 static inline bool blkcg_bio_issue_check(struct request_queue *q, 866 854 struct bio *bio) { return true; }

+10

include/linux/blk_types.h

··· 311 311 __REQ_RAHEAD, /* read ahead, can fail anytime */ 312 312 __REQ_BACKGROUND, /* background IO */ 313 313 __REQ_NOWAIT, /* Don't wait if request will block */ 314 + /* 315 + * When a shared kthread needs to issue a bio for a cgroup, doing 316 + * so synchronously can lead to priority inversions as the kthread 317 + * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes 318 + * submit_bio() punt the actual issuing to a dedicated per-blkcg 319 + * work item to avoid such priority inversions. 320 + */ 321 + __REQ_CGROUP_PUNT, 314 322 315 323 /* command specific flags for REQ_OP_WRITE_ZEROES: */ 316 324 __REQ_NOUNMAP, /* do not free blocks when zeroing */ ··· 345 337 #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) 346 338 #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) 347 339 #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) 340 + #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) 341 + 348 342 #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) 349 343 #define REQ_HIPRI (1ULL << __REQ_HIPRI) 350 344

+9 -5

include/linux/blkdev.h

··· 344 344 345 345 #ifdef CONFIG_BLK_DEV_ZONED 346 346 347 + /* 348 + * Maximum number of zones to report with a single report zones command. 349 + */ 350 + #define BLK_ZONED_REPORT_MAX_ZONES 8192U 351 + 347 352 extern unsigned int blkdev_nr_zones(struct block_device *bdev); 348 353 extern int blkdev_report_zones(struct block_device *bdev, 349 354 sector_t sector, struct blk_zone *zones, 350 - unsigned int *nr_zones, gfp_t gfp_mask); 355 + unsigned int *nr_zones); 351 356 extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, 352 357 sector_t nr_sectors, gfp_t gfp_mask); 353 358 extern int blk_revalidate_disk_zones(struct gendisk *disk); ··· 686 681 } 687 682 } 688 683 689 - static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) 684 + static inline sector_t blk_queue_zone_sectors(struct request_queue *q) 690 685 { 691 686 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; 692 687 } ··· 1423 1418 return false; 1424 1419 } 1425 1420 1426 - static inline unsigned int bdev_zone_sectors(struct block_device *bdev) 1421 + static inline sector_t bdev_zone_sectors(struct block_device *bdev) 1427 1422 { 1428 1423 struct request_queue *q = bdev_get_queue(bdev); 1429 1424 ··· 1678 1673 /* this callback is with swap_lock and sometimes page table lock held */ 1679 1674 void (*swap_slot_free_notify) (struct block_device *, unsigned long); 1680 1675 int (*report_zones)(struct gendisk *, sector_t sector, 1681 - struct blk_zone *zones, unsigned int *nr_zones, 1682 - gfp_t gfp_mask); 1676 + struct blk_zone *zones, unsigned int *nr_zones); 1683 1677 struct module *owner; 1684 1678 const struct pr_ops *pr_ops; 1685 1679 };

+1

include/linux/cgroup.h

··· 699 699 struct cgroup_subsys_state; 700 700 struct cgroup; 701 701 702 + static inline void css_get(struct cgroup_subsys_state *css) {} 702 703 static inline void css_put(struct cgroup_subsys_state *css) {} 703 704 static inline int cgroup_attach_task_all(struct task_struct *from, 704 705 struct task_struct *t) { return 0; }

+1 -2

include/linux/device-mapper.h

··· 95 95 96 96 typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector, 97 97 struct blk_zone *zones, 98 - unsigned int *nr_zones, 99 - gfp_t gfp_mask); 98 + unsigned int *nr_zones); 100 99 101 100 /* 102 101 * These iteration functions are typically used to check (and combine)

+1 -10

include/linux/elevator.h

··· 75 75 size_t icq_size; /* see iocontext.h */ 76 76 size_t icq_align; /* ditto */ 77 77 struct elv_fs_entry *elevator_attrs; 78 - char elevator_name[ELV_NAME_MAX]; 78 + const char *elevator_name; 79 79 const char *elevator_alias; 80 80 struct module *elevator_owner; 81 81 #ifdef CONFIG_BLK_DEBUG_FS ··· 159 159 #define ELEVATOR_INSERT_REQUEUE 4 160 160 #define ELEVATOR_INSERT_FLUSH 5 161 161 #define ELEVATOR_INSERT_SORT_MERGE 6 162 - 163 - /* 164 - * return values from elevator_may_queue_fn 165 - */ 166 - enum { 167 - ELV_MQUEUE_MAY, 168 - ELV_MQUEUE_NO, 169 - ELV_MQUEUE_MUST, 170 - }; 171 162 172 163 #define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) 173 164 #define rb_entry_rq(node) rb_entry((node), struct request, rb_node)

+9 -3

include/linux/nvme.h

··· 315 315 __u8 nmic; 316 316 __u8 rescap; 317 317 __u8 fpi; 318 - __u8 rsvd33; 318 + __u8 dlfeat; 319 319 __le16 nawun; 320 320 __le16 nawupf; 321 321 __le16 nacwu; ··· 324 324 __le16 nabspf; 325 325 __le16 noiob; 326 326 __u8 nvmcap[16]; 327 - __u8 rsvd64[28]; 327 + __le16 npwg; 328 + __le16 npwa; 329 + __le16 npdg; 330 + __le16 npda; 331 + __le16 nows; 332 + __u8 rsvd74[18]; 328 333 __le32 anagrpid; 329 334 __u8 rsvd96[3]; 330 335 __u8 nsattr; 331 - __u8 rsvd100[4]; 336 + __le16 nvmsetid; 337 + __le16 endgid; 332 338 __u8 nguid[16]; 333 339 __u8 eui64[8]; 334 340 struct nvme_lbaf lbaf[16];

+36 -9

include/linux/writeback.h

··· 11 11 #include <linux/flex_proportions.h> 12 12 #include <linux/backing-dev-defs.h> 13 13 #include <linux/blk_types.h> 14 + #include <linux/blk-cgroup.h> 14 15 15 16 struct bio; 16 17 ··· 69 68 unsigned for_reclaim:1; /* Invoked from the page allocator */ 70 69 unsigned range_cyclic:1; /* range_start is cyclic */ 71 70 unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ 71 + 72 + /* 73 + * When writeback IOs are bounced through async layers, only the 74 + * initial synchronous phase should be accounted towards inode 75 + * cgroup ownership arbitration to avoid confusion. Later stages 76 + * can set the following flag to disable the accounting. 77 + */ 78 + unsigned no_cgroup_owner:1; 79 + 80 + unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ 81 + 72 82 #ifdef CONFIG_CGROUP_WRITEBACK 73 83 struct bdi_writeback *wb; /* wb this writeback is issued under */ 74 84 struct inode *inode; /* inode being written out */ ··· 96 84 97 85 static inline int wbc_to_write_flags(struct writeback_control *wbc) 98 86 { 99 - if (wbc->sync_mode == WB_SYNC_ALL) 100 - return REQ_SYNC; 101 - else if (wbc->for_kupdate || wbc->for_background) 102 - return REQ_BACKGROUND; 87 + int flags = 0; 103 88 104 - return 0; 89 + if (wbc->punt_to_cgroup) 90 + flags = REQ_CGROUP_PUNT; 91 + 92 + if (wbc->sync_mode == WB_SYNC_ALL) 93 + flags |= REQ_SYNC; 94 + else if (wbc->for_kupdate || wbc->for_background) 95 + flags |= REQ_BACKGROUND; 96 + 97 + return flags; 98 + } 99 + 100 + static inline struct cgroup_subsys_state * 101 + wbc_blkcg_css(struct writeback_control *wbc) 102 + { 103 + #ifdef CONFIG_CGROUP_WRITEBACK 104 + if (wbc->wb) 105 + return wbc->wb->blkcg_css; 106 + #endif 107 + return blkcg_root_css; 105 108 } 106 109 107 110 /* ··· 215 188 struct inode *inode) 216 189 __releases(&inode->i_lock); 217 190 void wbc_detach_inode(struct writeback_control *wbc); 218 - void wbc_account_io(struct writeback_control *wbc, struct page *page, 219 - size_t bytes); 191 + void 
wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, 192 + size_t bytes); 220 193 void cgroup_writeback_umount(void); 221 194 222 195 /** ··· 318 291 { 319 292 } 320 293 321 - static inline void wbc_account_io(struct writeback_control *wbc, 322 - struct page *page, size_t bytes) 294 + static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, 295 + struct page *page, size_t bytes) 323 296 { 324 297 } 325 298

Configure Feed

Configure Feed