Merge tag 'for-6.16/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

+4

block/blk-crypto-profile.c

··· 501 501 blk_crypto_hw_exit(profile); 502 502 return err; 503 503 } 504 + EXPORT_SYMBOL_GPL(blk_crypto_derive_sw_secret); 504 505 505 506 int blk_crypto_import_key(struct blk_crypto_profile *profile, 506 507 const u8 *raw_key, size_t raw_key_size, ··· 521 520 blk_crypto_hw_exit(profile); 522 521 return ret; 523 522 } 523 + EXPORT_SYMBOL_GPL(blk_crypto_import_key); 524 524 525 525 int blk_crypto_generate_key(struct blk_crypto_profile *profile, 526 526 u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) ··· 539 537 blk_crypto_hw_exit(profile); 540 538 return ret; 541 539 } 540 + EXPORT_SYMBOL_GPL(blk_crypto_generate_key); 542 541 543 542 int blk_crypto_prepare_key(struct blk_crypto_profile *profile, 544 543 const u8 *lt_key, size_t lt_key_size, ··· 559 556 blk_crypto_hw_exit(profile); 560 557 return ret; 561 558 } 559 + EXPORT_SYMBOL_GPL(blk_crypto_prepare_key); 562 560 563 561 /** 564 562 * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities

+36 -153

drivers/md/dm-bufio.c

··· 41 41 #define DM_BUFIO_LOW_WATERMARK_RATIO 16 42 42 43 43 /* 44 - * Check buffer ages in this interval (seconds) 45 - */ 46 - #define DM_BUFIO_WORK_TIMER_SECS 30 47 - 48 - /* 49 - * Free buffers when they are older than this (seconds) 50 - */ 51 - #define DM_BUFIO_DEFAULT_AGE_SECS 300 52 - 53 - /* 54 44 * The nr of bytes of cached data to keep around. 55 45 */ 56 46 #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) ··· 1047 1057 1048 1058 static DEFINE_SPINLOCK(global_spinlock); 1049 1059 1050 - /* 1051 - * Buffers are freed after this timeout 1052 - */ 1053 - static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; 1060 + static unsigned int dm_bufio_max_age; /* No longer does anything */ 1061 + 1054 1062 static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; 1055 1063 1056 1064 static unsigned long dm_bufio_peak_allocated; ··· 1076 1088 static DEFINE_MUTEX(dm_bufio_clients_lock); 1077 1089 1078 1090 static struct workqueue_struct *dm_bufio_wq; 1079 - static struct delayed_work dm_bufio_cleanup_old_work; 1080 1091 static struct work_struct dm_bufio_replacement_work; 1081 1092 1082 1093 ··· 2667 2680 2668 2681 /*--------------------------------------------------------------*/ 2669 2682 2670 - static unsigned int get_max_age_hz(void) 2671 - { 2672 - unsigned int max_age = READ_ONCE(dm_bufio_max_age); 2673 - 2674 - if (max_age > UINT_MAX / HZ) 2675 - max_age = UINT_MAX / HZ; 2676 - 2677 - return max_age * HZ; 2678 - } 2679 - 2680 - static bool older_than(struct dm_buffer *b, unsigned long age_hz) 2681 - { 2682 - return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz); 2683 - } 2684 - 2685 - struct evict_params { 2686 - gfp_t gfp; 2687 - unsigned long age_hz; 2688 - 2689 - /* 2690 - * This gets updated with the largest last_accessed (ie. most 2691 - * recently used) of the evicted buffers. It will not be reinitialised 2692 - * by __evict_many(), so you can use it across multiple invocations. 2693 - */ 2694 - unsigned long last_accessed; 2695 - }; 2696 - 2697 - /* 2698 - * We may not be able to evict this buffer if IO pending or the client 2699 - * is still using it. 2700 - * 2701 - * And if GFP_NOFS is used, we must not do any I/O because we hold 2702 - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets 2703 - * rerouted to different bufio client. 2704 - */ 2705 - static enum evict_result select_for_evict(struct dm_buffer *b, void *context) 2706 - { 2707 - struct evict_params *params = context; 2708 - 2709 - if (!(params->gfp & __GFP_FS) || 2710 - (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { 2711 - if (test_bit_acquire(B_READING, &b->state) || 2712 - test_bit(B_WRITING, &b->state) || 2713 - test_bit(B_DIRTY, &b->state)) 2714 - return ER_DONT_EVICT; 2715 - } 2716 - 2717 - return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP; 2718 - } 2719 - 2720 - static unsigned long __evict_many(struct dm_bufio_client *c, 2721 - struct evict_params *params, 2722 - int list_mode, unsigned long max_count) 2723 - { 2724 - unsigned long count; 2725 - unsigned long last_accessed; 2726 - struct dm_buffer *b; 2727 - 2728 - for (count = 0; count < max_count; count++) { 2729 - b = cache_evict(&c->cache, list_mode, select_for_evict, params); 2730 - if (!b) 2731 - break; 2732 - 2733 - last_accessed = READ_ONCE(b->last_accessed); 2734 - if (time_after_eq(params->last_accessed, last_accessed)) 2735 - params->last_accessed = last_accessed; 2736 - 2737 - __make_buffer_clean(b); 2738 - __free_buffer_wake(b); 2739 - 2740 - cond_resched(); 2741 - } 2742 - 2743 - return count; 2744 - } 2745 - 2746 - static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) 2747 - { 2748 - struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0}; 2749 - unsigned long retain = get_retain_buffers(c); 2750 - unsigned long count; 2751 - LIST_HEAD(write_list); 2752 - 2753 - dm_bufio_lock(c); 2754 - 2755 - __check_watermark(c, &write_list); 2756 - if (unlikely(!list_empty(&write_list))) { 2757 - dm_bufio_unlock(c); 2758 - __flush_write_list(&write_list); 2759 - dm_bufio_lock(c); 2760 - } 2761 - 2762 - count = cache_total(&c->cache); 2763 - if (count > retain) 2764 - __evict_many(c, &params, LIST_CLEAN, count - retain); 2765 - 2766 - dm_bufio_unlock(c); 2767 - } 2768 - 2769 - static void cleanup_old_buffers(void) 2770 - { 2771 - unsigned long max_age_hz = get_max_age_hz(); 2772 - struct dm_bufio_client *c; 2773 - 2774 - mutex_lock(&dm_bufio_clients_lock); 2775 - 2776 - __cache_size_refresh(); 2777 - 2778 - list_for_each_entry(c, &dm_bufio_all_clients, client_list) 2779 - evict_old_buffers(c, max_age_hz); 2780 - 2781 - mutex_unlock(&dm_bufio_clients_lock); 2782 - } 2783 - 2784 - static void work_fn(struct work_struct *w) 2785 - { 2786 - cleanup_old_buffers(); 2787 - 2788 - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, 2789 - DM_BUFIO_WORK_TIMER_SECS * HZ); 2790 - } 2791 - 2792 - /*--------------------------------------------------------------*/ 2793 - 2794 2683 /* 2795 2684 * Global cleanup tries to evict the oldest buffers from across _all_ 2796 2685 * the clients. It does this by repeatedly evicting a few buffers from ··· 2704 2841 list_add_tail(&new_client->client_list, h); 2705 2842 } 2706 2843 2844 + static enum evict_result select_for_evict(struct dm_buffer *b, void *context) 2845 + { 2846 + /* In no-sleep mode, we cannot wait on IO. */ 2847 + if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) { 2848 + if (test_bit_acquire(B_READING, &b->state) || 2849 + test_bit(B_WRITING, &b->state) || 2850 + test_bit(B_DIRTY, &b->state)) 2851 + return ER_DONT_EVICT; 2852 + } 2853 + return ER_EVICT; 2854 + } 2855 + 2707 2856 static unsigned long __evict_a_few(unsigned long nr_buffers) 2708 2857 { 2709 - unsigned long count; 2710 2858 struct dm_bufio_client *c; 2711 - struct evict_params params = { 2712 - .gfp = GFP_KERNEL, 2713 - .age_hz = 0, 2714 - /* set to jiffies in case there are no buffers in this client */ 2715 - .last_accessed = jiffies 2716 - }; 2859 + unsigned long oldest_buffer = jiffies; 2860 + unsigned long last_accessed; 2861 + unsigned long count; 2862 + struct dm_buffer *b; 2717 2863 2718 2864 c = __pop_client(); 2719 2865 if (!c) 2720 2866 return 0; 2721 2867 2722 2868 dm_bufio_lock(c); 2723 - count = __evict_many(c, &params, LIST_CLEAN, nr_buffers); 2869 + 2870 + for (count = 0; count < nr_buffers; count++) { 2871 + b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL); 2872 + if (!b) 2873 + break; 2874 + 2875 + last_accessed = READ_ONCE(b->last_accessed); 2876 + if (time_after_eq(oldest_buffer, last_accessed)) 2877 + oldest_buffer = last_accessed; 2878 + 2879 + __make_buffer_clean(b); 2880 + __free_buffer_wake(b); 2881 + 2882 + cond_resched(); 2883 + } 2884 + 2724 2885 dm_bufio_unlock(c); 2725 2886 2726 2887 if (count) 2727 - c->oldest_buffer = params.last_accessed; 2888 + c->oldest_buffer = oldest_buffer; 2728 2889 __insert_client(c); 2729 2890 2730 2891 return count; ··· 2831 2944 if (!dm_bufio_wq) 2832 2945 return -ENOMEM; 2833 2946 2834 - INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); 2835 2947 INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); 2836 - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, 2837 - DM_BUFIO_WORK_TIMER_SECS * HZ); 2838 2948 2839 2949 return 0; 2840 2950 } ··· 2843 2959 { 2844 2960 int bug = 0; 2845 2961 2846 - cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); 2847 2962 destroy_workqueue(dm_bufio_wq); 2848 2963 2849 2964 if (dm_bufio_client_count) { ··· 2879 2996 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); 2880 2997 2881 2998 module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644); 2882 - MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); 2999 + MODULE_PARM_DESC(max_age_seconds, "No longer does anything"); 2883 3000 2884 3001 module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644); 2885 3002 MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

+1 -3

drivers/md/dm-core.h

··· 141 141 #ifdef CONFIG_BLK_DEV_ZONED 142 142 unsigned int nr_zones; 143 143 void *zone_revalidate_map; 144 + struct task_struct *revalidate_map_task; 144 145 #endif 145 146 146 147 #ifdef CONFIG_IMA ··· 162 161 #define DMF_SUSPENDED_INTERNALLY 7 163 162 #define DMF_POST_SUSPENDING 8 164 163 #define DMF_EMULATE_ZONE_APPEND 9 165 - 166 - void disable_discard(struct mapped_device *md); 167 - void disable_write_zeroes(struct mapped_device *md); 168 164 169 165 static inline sector_t dm_get_size(struct mapped_device *md) 170 166 {

+14 -3

drivers/md/dm-delay.c

··· 14 14 #include <linux/bio.h> 15 15 #include <linux/slab.h> 16 16 #include <linux/kthread.h> 17 + #include <linux/delay.h> 17 18 18 19 #include <linux/device-mapper.h> 19 20 20 21 #define DM_MSG_PREFIX "delay" 22 + 23 + #define SLEEP_SHIFT 3 21 24 22 25 struct delay_class { 23 26 struct dm_dev *dev; ··· 37 34 struct work_struct flush_expired_bios; 38 35 struct list_head delayed_bios; 39 36 struct task_struct *worker; 37 + unsigned int worker_sleep_us; 40 38 bool may_delay; 41 39 42 40 struct delay_class read; ··· 140 136 schedule(); 141 137 } else { 142 138 spin_unlock(&dc->delayed_bios_lock); 139 + fsleep(dc->worker_sleep_us); 143 140 cond_resched(); 144 141 } 145 142 } ··· 217 212 { 218 213 struct delay_c *dc; 219 214 int ret; 220 - unsigned int max_delay; 215 + unsigned int max_delay, min_delay; 221 216 222 217 if (argc != 3 && argc != 6 && argc != 9) { 223 218 ti->error = "Requires exactly 3, 6 or 9 arguments"; ··· 240 235 ret = delay_class_ctr(ti, &dc->read, argv); 241 236 if (ret) 242 237 goto bad; 243 - max_delay = dc->read.delay; 238 + min_delay = max_delay = dc->read.delay; 244 239 245 240 if (argc == 3) { 246 241 ret = delay_class_ctr(ti, &dc->write, argv); ··· 256 251 if (ret) 257 252 goto bad; 258 253 max_delay = max(max_delay, dc->write.delay); 254 + min_delay = min_not_zero(min_delay, dc->write.delay); 259 255 260 256 if (argc == 6) { 261 257 ret = delay_class_ctr(ti, &dc->flush, argv + 3); ··· 269 263 if (ret) 270 264 goto bad; 271 265 max_delay = max(max_delay, dc->flush.delay); 266 + min_delay = min_not_zero(min_delay, dc->flush.delay); 272 267 273 268 out: 274 269 if (max_delay < 50) { 270 + if (min_delay >> SLEEP_SHIFT) 271 + dc->worker_sleep_us = 1000; 272 + else 273 + dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT; 275 274 /* 276 275 * In case of small requested delays, use kthread instead of 277 276 * timers and workqueue to achieve better latency. ··· 449 438 450 439 static struct target_type delay_target = { 451 440 .name = "delay", 452 - .version = {1, 4, 0}, 441 + .version = {1, 5, 0}, 453 442 .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, 454 443 .module = THIS_MODULE, 455 444 .ctr = delay_ctr,

+3 -1

drivers/md/dm-dust.c

··· 534 534 } 535 535 } 536 536 537 - static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 537 + static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, 538 + unsigned int cmd, unsigned long arg, 539 + bool *forward) 538 540 { 539 541 struct dust_device *dd = ti->private; 540 542 struct dm_dev *dev = dd->dev;

+2 -1

drivers/md/dm-ebs-target.c

··· 415 415 } 416 416 } 417 417 418 - static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 418 + static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, 419 + unsigned int cmd, unsigned long arg, bool *forward) 419 420 { 420 421 struct ebs_c *ec = ti->private; 421 422 struct dm_dev *dev = ec->dev;

+65 -53

drivers/md/dm-flakey.c

··· 47 47 }; 48 48 49 49 struct per_bio_data { 50 - bool bio_submitted; 50 + bool bio_can_corrupt; 51 + struct bvec_iter saved_iter; 51 52 }; 52 53 53 54 static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, 54 55 struct dm_target *ti) 55 56 { 56 - int r; 57 - unsigned int argc; 57 + int r = 0; 58 + unsigned int argc = 0; 58 59 const char *arg_name; 59 60 60 61 static const struct dm_arg _args[] = { ··· 66 65 {0, PROBABILITY_BASE, "Invalid random corrupt argument"}, 67 66 }; 68 67 69 - /* No feature arguments supplied. */ 70 - if (!as->argc) 71 - return 0; 72 - 73 - r = dm_read_arg_group(_args, as, &argc, &ti->error); 74 - if (r) 68 + if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error))) 75 69 return r; 70 + 71 + /* No feature arguments supplied. */ 72 + if (!argc) 73 + goto error_all_io; 76 74 77 75 while (argc) { 78 76 arg_name = dm_shift_arg(as); ··· 128 128 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> 129 129 */ 130 130 if (!strcasecmp(arg_name, "corrupt_bio_byte")) { 131 - if (!argc) { 132 - ti->error = "Feature corrupt_bio_byte requires parameters"; 131 + if (fc->corrupt_bio_byte) { 132 + ti->error = "Feature corrupt_bio_byte duplicated"; 133 + return -EINVAL; 134 + } else if (argc < 4) { 135 + ti->error = "Feature corrupt_bio_byte requires 4 parameters"; 133 136 return -EINVAL; 134 137 } 135 138 ··· 179 176 } 180 177 181 178 if (!strcasecmp(arg_name, "random_read_corrupt")) { 182 - if (!argc) { 179 + if (fc->random_read_corrupt) { 180 + ti->error = "Feature random_read_corrupt duplicated"; 181 + return -EINVAL; 182 + } else if (!argc) { 183 183 ti->error = "Feature random_read_corrupt requires a parameter"; 184 184 return -EINVAL; 185 185 } ··· 195 189 } 196 190 197 191 if (!strcasecmp(arg_name, "random_write_corrupt")) { 198 - if (!argc) { 192 + if (fc->random_write_corrupt) { 193 + ti->error = "Feature random_write_corrupt duplicated"; 194 + return -EINVAL; 195 + } else if (!argc) { 199 196 ti->error = "Feature random_write_corrupt requires a parameter"; 200 197 return -EINVAL; 201 198 } ··· 214 205 return -EINVAL; 215 206 } 216 207 217 - if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { 218 - ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; 208 + if (test_bit(DROP_WRITES, &fc->flags) && 209 + (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { 210 + ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; 219 211 return -EINVAL; 220 212 221 - } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { 222 - ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; 213 + } else if (test_bit(ERROR_WRITES, &fc->flags) && 214 + (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { 215 + ti->error = "error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; 216 + return -EINVAL; 217 + } else if (test_bit(ERROR_READS, &fc->flags) && 218 + (fc->corrupt_bio_rw == READ || fc->random_read_corrupt)) { 219 + ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag set"; 223 220 return -EINVAL; 224 221 } 225 222 226 223 if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) && 227 224 !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) && 228 225 !fc->random_read_corrupt && !fc->random_write_corrupt) { 226 + error_all_io: 229 227 set_bit(ERROR_WRITES, &fc->flags); 230 228 set_bit(ERROR_READS, &fc->flags); 231 229 } ··· 294 278 if (r) 295 279 goto bad; 296 280 297 - r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); 281 + r = dm_read_arg(_args + 1, &as, &fc->down_interval, &ti->error); 298 282 if (r) 299 283 goto bad; 300 284 ··· 355 339 } 356 340 357 341 static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, 358 - unsigned char corrupt_bio_value) 342 + unsigned char corrupt_bio_value, 343 + struct bvec_iter start) 359 344 { 360 345 struct bvec_iter iter; 361 346 struct bio_vec bvec; ··· 365 348 * Overwrite the Nth byte of the bio's data, on whichever page 366 349 * it falls. 367 350 */ 368 - bio_for_each_segment(bvec, bio, iter) { 351 + __bio_for_each_segment(bvec, bio, iter, start) { 369 352 if (bio_iter_len(bio, iter) > corrupt_bio_byte) { 370 353 unsigned char *segment = bvec_kmap_local(&bvec); 371 354 segment[corrupt_bio_byte] = corrupt_bio_value; ··· 374 357 "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", 375 358 bio, corrupt_bio_value, corrupt_bio_byte, 376 359 (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, 377 - (unsigned long long)bio->bi_iter.bi_sector, 378 - bio->bi_iter.bi_size); 360 + (unsigned long long)start.bi_sector, 361 + start.bi_size); 379 362 break; 380 363 } 381 364 corrupt_bio_byte -= bio_iter_len(bio, iter); 382 365 } 383 366 } 384 367 385 - static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) 368 + static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc, 369 + struct bvec_iter start) 386 370 { 387 371 unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; 388 372 389 - if (!bio_has_data(bio)) 390 - return; 391 - 392 - corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value); 373 + corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start); 393 374 } 394 375 395 - static void corrupt_bio_random(struct bio *bio) 376 + static void corrupt_bio_random(struct bio *bio, struct bvec_iter start) 396 377 { 397 378 unsigned int corrupt_byte; 398 379 unsigned char corrupt_value; 399 380 400 - if (!bio_has_data(bio)) 401 - return; 402 - 403 - corrupt_byte = get_random_u32() % bio->bi_iter.bi_size; 381 + corrupt_byte = get_random_u32() % start.bi_size; 404 382 corrupt_value = get_random_u8(); 405 383 406 - corrupt_bio_common(bio, corrupt_byte, corrupt_value); 384 + corrupt_bio_common(bio, corrupt_byte, corrupt_value, start); 407 385 } 408 386 409 387 static void clone_free(struct bio *clone) ··· 493 481 unsigned int elapsed; 494 482 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); 495 483 496 - pb->bio_submitted = false; 484 + pb->bio_can_corrupt = false; 497 485 498 486 if (op_is_zone_mgmt(bio_op(bio))) 499 487 goto map_bio; ··· 502 490 elapsed = (jiffies - fc->start_time) / HZ; 503 491 if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { 504 492 bool corrupt_fixed, corrupt_random; 505 - /* 506 - * Flag this bio as submitted while down. 507 - */ 508 - pb->bio_submitted = true; 493 + 494 + if (bio_has_data(bio)) { 495 + pb->bio_can_corrupt = true; 496 + pb->saved_iter = bio->bi_iter; 497 + } 509 498 510 499 /* 511 - * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set. 512 - * Otherwise, flakey_end_io() will decide if the reads should be modified. 500 + * If ERROR_READS isn't set flakey_end_io() will decide if the 501 + * reads should be modified. 513 502 */ 514 503 if (bio_data_dir(bio) == READ) { 515 504 if (test_bit(ERROR_READS, &fc->flags)) ··· 529 516 return DM_MAPIO_SUBMITTED; 530 517 } 531 518 519 + if (!pb->bio_can_corrupt) 520 + goto map_bio; 532 521 /* 533 522 * Corrupt matching writes. 534 523 */ ··· 550 535 struct bio *clone = clone_bio(ti, fc, bio); 551 536 if (clone) { 552 537 if (corrupt_fixed) 553 - corrupt_bio_data(clone, fc); 538 + corrupt_bio_data(clone, fc, 539 + clone->bi_iter); 554 540 if (corrupt_random) 555 - corrupt_bio_random(clone); 541 + corrupt_bio_random(clone, 542 + clone->bi_iter); 556 543 submit_bio(clone); 557 544 return DM_MAPIO_SUBMITTED; 558 545 } ··· 576 559 if (op_is_zone_mgmt(bio_op(bio))) 577 560 return DM_ENDIO_DONE; 578 561 579 - if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { 562 + if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) { 580 563 if (fc->corrupt_bio_byte) { 581 564 if ((fc->corrupt_bio_rw == READ) && 582 565 all_corrupt_bio_flags_match(bio, fc)) { 583 566 /* 584 567 * Corrupt successful matching READs while in down state. 585 568 */ 586 - corrupt_bio_data(bio, fc); 569 + corrupt_bio_data(bio, fc, pb->saved_iter); 587 570 } 588 571 } 589 572 if (fc->random_read_corrupt) { 590 573 u64 rnd = get_random_u64(); 591 574 u32 rem = do_div(rnd, PROBABILITY_BASE); 592 575 if (rem < fc->random_read_corrupt) 593 - corrupt_bio_random(bio); 594 - } 595 - if (test_bit(ERROR_READS, &fc->flags)) { 596 - /* 597 - * Error read during the down_interval if drop_writes 598 - * and error_writes were not configured. 599 - */ 600 - *error = BLK_STS_IOERR; 576 + corrupt_bio_random(bio, pb->saved_iter); 601 577 } 602 578 } 603 579 ··· 648 638 } 649 639 } 650 640 651 - static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 641 + static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, 642 + unsigned int cmd, unsigned long arg, 643 + bool *forward) 652 644 { 653 645 struct flakey_c *fc = ti->private; 654 646

+1

drivers/md/dm-ioctl.c

··· 1885 1885 {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, 1886 1886 {DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, 1887 1887 {DM_GET_TARGET_VERSION_CMD, 0, get_target_version}, 1888 + {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */ 1888 1889 }; 1889 1890 1890 1891 if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))

+3 -1

drivers/md/dm-linear.c

··· 119 119 } 120 120 } 121 121 122 - static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 122 + static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, 123 + unsigned int cmd, unsigned long arg, 124 + bool *forward) 123 125 { 124 126 struct linear_c *lc = ti->private; 125 127 struct dm_dev *dev = lc->dev;

+3 -1

drivers/md/dm-log-writes.c

··· 818 818 } 819 819 820 820 static int log_writes_prepare_ioctl(struct dm_target *ti, 821 - struct block_device **bdev) 821 + struct block_device **bdev, 822 + unsigned int cmd, unsigned long arg, 823 + bool *forward) 822 824 { 823 825 struct log_writes_c *lc = ti->private; 824 826 struct dm_dev *dev = lc->dev;

+189 -64

drivers/md/dm-mpath.c

··· 79 79 struct pgpath *current_pgpath; 80 80 struct priority_group *current_pg; 81 81 struct priority_group *next_pg; /* Switch to this PG if set */ 82 + struct priority_group *last_probed_pg; 82 83 83 84 atomic_t nr_valid_paths; /* Total number of usable paths */ 84 85 unsigned int nr_priority_groups; ··· 88 87 const char *hw_handler_name; 89 88 char *hw_handler_params; 90 89 wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ 90 + wait_queue_head_t probe_wait; /* Wait for probing paths */ 91 91 unsigned int pg_init_retries; /* Number of times to retry pg_init */ 92 92 unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */ 93 93 atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ ··· 102 100 struct bio_list queued_bios; 103 101 104 102 struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ 103 + bool is_suspending; 105 104 }; 106 105 107 106 /* ··· 135 132 #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ 136 133 #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ 137 134 #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ 135 + #define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */ 136 + #define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */ 138 137 139 138 static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) 140 139 { ··· 259 254 atomic_set(&m->pg_init_count, 0); 260 255 m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; 261 256 init_waitqueue_head(&m->pg_init_wait); 257 + init_waitqueue_head(&m->probe_wait); 262 258 263 259 return 0; 264 260 } ··· 419 413 goto failed; 420 414 } 421 415 422 - /* Were we instructed to switch PG? */ 423 - if (READ_ONCE(m->next_pg)) { 424 - spin_lock_irqsave(&m->lock, flags); 425 - pg = m->next_pg; 426 - if (!pg) { 427 - spin_unlock_irqrestore(&m->lock, flags); 428 - goto check_current_pg; 429 - } 430 - m->next_pg = NULL; 431 - spin_unlock_irqrestore(&m->lock, flags); 432 - pgpath = choose_path_in_pg(m, pg, nr_bytes); 433 - if (!IS_ERR_OR_NULL(pgpath)) 434 - return pgpath; 435 - } 436 - 437 416 /* Don't change PG until it has no remaining paths */ 438 - check_current_pg: 439 417 pg = READ_ONCE(m->current_pg); 440 418 if (pg) { 441 419 pgpath = choose_path_in_pg(m, pg, nr_bytes); ··· 427 437 return pgpath; 428 438 } 429 439 440 + /* Were we instructed to switch PG? */ 441 + if (READ_ONCE(m->next_pg)) { 442 + spin_lock_irqsave(&m->lock, flags); 443 + pg = m->next_pg; 444 + if (!pg) { 445 + spin_unlock_irqrestore(&m->lock, flags); 446 + goto check_all_pgs; 447 + } 448 + m->next_pg = NULL; 449 + spin_unlock_irqrestore(&m->lock, flags); 450 + pgpath = choose_path_in_pg(m, pg, nr_bytes); 451 + if (!IS_ERR_OR_NULL(pgpath)) 452 + return pgpath; 453 + } 454 + check_all_pgs: 430 455 /* 431 456 * Loop through priority groups until we find a valid path. 432 457 * First time we skip PGs marked 'bypassed'. ··· 617 612 static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) 618 613 { 619 614 struct pgpath *pgpath; 620 - unsigned long flags; 621 615 622 616 /* Do we need to select a new pgpath? */ 623 617 pgpath = READ_ONCE(m->current_pgpath); ··· 624 620 pgpath = choose_pgpath(m, bio->bi_iter.bi_size); 625 621 626 622 if (!pgpath) { 627 - spin_lock_irqsave(&m->lock, flags); 623 + spin_lock_irq(&m->lock); 628 624 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { 629 625 __multipath_queue_bio(m, bio); 630 626 pgpath = ERR_PTR(-EAGAIN); 631 627 } 632 - spin_unlock_irqrestore(&m->lock, flags); 628 + spin_unlock_irq(&m->lock); 633 629 634 630 } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || 635 631 mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { ··· 692 688 static void process_queued_bios(struct work_struct *work) 693 689 { 694 690 int r; 695 - unsigned long flags; 696 691 struct bio *bio; 697 692 struct bio_list bios; 698 693 struct blk_plug plug; ··· 700 697 701 698 bio_list_init(&bios); 702 699 703 - spin_lock_irqsave(&m->lock, flags); 700 + spin_lock_irq(&m->lock); 704 701 705 702 if (bio_list_empty(&m->queued_bios)) { 706 - spin_unlock_irqrestore(&m->lock, flags); 703 + spin_unlock_irq(&m->lock); 707 704 return; 708 705 } 709 706 710 707 bio_list_merge_init(&bios, &m->queued_bios); 711 708 712 - spin_unlock_irqrestore(&m->lock, flags); 709 + spin_unlock_irq(&m->lock); 713 710 714 711 blk_start_plug(&plug); 715 712 while ((bio = bio_list_pop(&bios))) { ··· 1193 1190 struct dm_arg_set as; 1194 1191 unsigned int pg_count = 0; 1195 1192 unsigned int next_pg_num; 1196 - unsigned long flags; 1197 1193 1198 1194 as.argc = argc; 1199 1195 as.argv = argv; ··· 1257 1255 goto bad; 1258 1256 } 1259 1257 1260 - spin_lock_irqsave(&m->lock, flags); 1258 + spin_lock_irq(&m->lock); 1261 1259 enable_nopath_timeout(m); 1262 - spin_unlock_irqrestore(&m->lock, flags); 1260 + spin_unlock_irq(&m->lock); 1263 1261 1264 1262 ti->num_flush_bios = 1; 1265 1263 ti->num_discard_bios = 1; ··· 1294 1292 static void flush_multipath_work(struct multipath *m) 1295 1293 { 1296 1294 if (m->hw_handler_name) { 1297 - unsigned long flags; 1298 - 1299 1295 if (!atomic_read(&m->pg_init_in_progress)) 1300 1296 goto skip; 1301 1297 1302 - spin_lock_irqsave(&m->lock, flags); 1298 + spin_lock_irq(&m->lock); 1303 1299 if (atomic_read(&m->pg_init_in_progress) && 1304 1300 !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) { 1305 - spin_unlock_irqrestore(&m->lock, flags); 1301 + spin_unlock_irq(&m->lock); 1306 1302 1307 1303 flush_workqueue(kmpath_handlerd); 1308 1304 multipath_wait_for_pg_init_completion(m); 1309 1305 1310 - spin_lock_irqsave(&m->lock, flags); 1306 + spin_lock_irq(&m->lock); 1311 1307 clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); 1312 1308 } 1313 - spin_unlock_irqrestore(&m->lock, flags); 1309 + spin_unlock_irq(&m->lock); 1314 1310 } 1315 1311 skip: 1316 1312 if (m->queue_mode == DM_TYPE_BIO_BASED) ··· 1370 1370 static int reinstate_path(struct pgpath *pgpath) 1371 1371 { 1372 1372 int r = 0, run_queue = 0; 1373 - unsigned long flags; 1374 1373 struct multipath *m = pgpath->pg->m; 1375 1374 unsigned int nr_valid_paths; 1376 1375 1377 - spin_lock_irqsave(&m->lock, flags); 1376 + spin_lock_irq(&m->lock); 1378 1377 1379 1378 if (pgpath->is_active) 1380 1379 goto out; ··· 1403 1404 schedule_work(&m->trigger_event); 1404 1405 1405 1406 out: 1406 - spin_unlock_irqrestore(&m->lock, flags); 1407 + spin_unlock_irq(&m->lock); 1407 1408 if (run_queue) { 1408 1409 dm_table_run_md_queue_async(m->ti->table); 1409 1410 process_queued_io_list(m); ··· 1438 1439 * Temporarily try to avoid having to use the specified PG 1439 1440 */ 1440 1441 static void bypass_pg(struct multipath *m, struct priority_group *pg, 1441 - bool bypassed) 1442 + bool bypassed, bool can_be_delayed) 1442 1443 { 1443 1444 unsigned long flags; 1444 1445 1445 1446 spin_lock_irqsave(&m->lock, flags); 1446 1447 1447 1448 pg->bypassed = bypassed; 1448 - m->current_pgpath = NULL; 1449 - m->current_pg = NULL; 1449 + if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) 1450 + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); 1451 + else { 1452 + m->current_pgpath = NULL; 1453 + m->current_pg = NULL; 1454 + } 1450 1455 1451 1456 spin_unlock_irqrestore(&m->lock, flags); 1452 1457 ··· 1464 1461 { 1465 1462 struct priority_group *pg; 1466 1463 unsigned int pgnum; 1467 - unsigned long flags; 1468 1464 char dummy; 1469 1465 1470 1466 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || ··· 1472 1470 return -EINVAL; 1473 1471 } 1474 1472 1475 - spin_lock_irqsave(&m->lock, flags); 1473 + spin_lock_irq(&m->lock); 1476 1474 list_for_each_entry(pg, &m->priority_groups, list) { 1477 1475 pg->bypassed = false; 1478 1476 if (--pgnum) 1479 1477 continue; 1480 1478 1481 - m->current_pgpath = NULL; 1482 - m->current_pg = NULL; 1479 + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) 1480 + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); 1481 + else { 1482 + m->current_pgpath = NULL; 1483 + m->current_pg = NULL; 1484 + } 1483 1485 m->next_pg = pg; 1484 1486 } 1485 - spin_unlock_irqrestore(&m->lock, flags); 1487 + spin_unlock_irq(&m->lock); 1486 1488 1487 1489 schedule_work(&m->trigger_event); 1488 1490 return 0; ··· 1513 1507 break; 1514 1508 } 1515 1509 1516 - bypass_pg(m, pg, bypassed); 1510 + bypass_pg(m, pg, bypassed, true); 1517 1511 return 0; 1518 1512 } 1519 1513 ··· 1567 1561 * Probably doing something like FW upgrade on the 1568 1562 * controller so try the other pg. 1569 1563 */ 1570 - bypass_pg(m, pg, true); 1564 + bypass_pg(m, pg, true, false); 1571 1565 break; 1572 1566 case SCSI_DH_RETRY: 1573 1567 /* Wait before retrying. */ ··· 1748 1742 { 1749 1743 struct multipath *m = ti->private; 1750 1744 1745 + spin_lock_irq(&m->lock); 1746 + m->is_suspending = true; 1747 + spin_unlock_irq(&m->lock); 1751 1748 /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ 1752 1749 if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) 1753 1750 queue_if_no_path(m, false, true, __func__); ··· 1771 1762 static void multipath_resume(struct dm_target *ti) 1772 1763 { 1773 1764 struct multipath *m = ti->private; 1774 - unsigned long flags; 1775 1765 1776 - spin_lock_irqsave(&m->lock, flags); 1766 + spin_lock_irq(&m->lock); 1767 + m->is_suspending = false; 1777 1768 if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { 1778 1769 set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); 1779 1770 clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); ··· 1784 1775 test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), 1785 1776 test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); 1786 1777 1787 - spin_unlock_irqrestore(&m->lock, flags); 1778 + spin_unlock_irq(&m->lock); 1788 1779 } 1789 1780 1790 1781 /* ··· 1807 1798 unsigned int status_flags, char *result, unsigned int maxlen) 1808 1799 { 1809 1800 int sz = 0, pg_counter, pgpath_counter; 1810 - unsigned long flags; 1811 1801 struct multipath *m = ti->private; 1812 1802 struct priority_group *pg; 1813 1803 struct pgpath *p; 1814 1804 unsigned int pg_num; 1815 1805 char state; 1816 1806 1817 - spin_lock_irqsave(&m->lock, flags); 1807 + spin_lock_irq(&m->lock); 1818 1808 1819 1809 /* Features */ 1820 1810 if (type == STATUSTYPE_INFO) ··· 1853 1845 1854 1846 DMEMIT("%u ", m->nr_priority_groups); 1855 1847 1856 - if (m->next_pg) 1857 - pg_num = m->next_pg->pg_num; 1858 - else if (m->current_pg) 1848 + if (m->current_pg) 1859 1849 pg_num = m->current_pg->pg_num; 1850 + else if (m->next_pg) 1851 + pg_num = m->next_pg->pg_num; 1860 1852 else 1861 1853 pg_num = (m->nr_priority_groups ? 1 : 0); 1862 1854 ··· 1959 1951 break; 1960 1952 } 1961 1953 1962 - spin_unlock_irqrestore(&m->lock, flags); 1954 + spin_unlock_irq(&m->lock); 1963 1955 } 1964 1956 1965 1957 static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv, ··· 1969 1961 dev_t dev; 1970 1962 struct multipath *m = ti->private; 1971 1963 action_fn action; 1972 - unsigned long flags; 1973 1964 1974 1965 mutex_lock(&m->work_mutex); 1975 1966 ··· 1980 1973 if (argc == 1) { 1981 1974 if (!strcasecmp(argv[0], "queue_if_no_path")) { 1982 1975 r = queue_if_no_path(m, true, false, __func__); 1983 - spin_lock_irqsave(&m->lock, flags); 1976 + spin_lock_irq(&m->lock); 1984 1977 enable_nopath_timeout(m); 1985 - spin_unlock_irqrestore(&m->lock, flags); 1978 + spin_unlock_irq(&m->lock); 1986 1979 goto out; 1987 1980 } else if (!strcasecmp(argv[0], "fail_if_no_path")) { 1988 1981 r = queue_if_no_path(m, false, false, __func__); ··· 2028 2021 return r; 2029 2022 } 2030 2023 2024 + /* 2025 + * Perform a minimal read from the given path to find out whether the 2026 + * path still works. If a path error occurs, fail it. 2027 + */ 2028 + static int probe_path(struct pgpath *pgpath) 2029 + { 2030 + struct block_device *bdev = pgpath->path.dev->bdev; 2031 + unsigned int read_size = bdev_logical_block_size(bdev); 2032 + struct page *page; 2033 + struct bio *bio; 2034 + blk_status_t status; 2035 + int r = 0; 2036 + 2037 + if (WARN_ON_ONCE(read_size > PAGE_SIZE)) 2038 + return -EINVAL; 2039 + 2040 + page = alloc_page(GFP_KERNEL); 2041 + if (!page) 2042 + return -ENOMEM; 2043 + 2044 + /* Perform a minimal read: Sector 0, length read_size */ 2045 + bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); 2046 + if (!bio) { 2047 + r = -ENOMEM; 2048 + goto out; 2049 + } 2050 + 2051 + bio->bi_iter.bi_sector = 0; 2052 + __bio_add_page(bio, page, read_size, 0); 2053 + submit_bio_wait(bio); 2054 + status = bio->bi_status; 2055 + bio_put(bio); 2056 + 2057 + if (status && blk_path_error(status)) 2058 + fail_path(pgpath); 2059 + 2060 + out: 2061 + __free_page(page); 2062 + return r; 2063 + } 2064 + 2065 + /* 2066 + * Probe all active paths in current_pg to find out whether they still work. 2067 + * Fail all paths that do not work. 2068 + * 2069 + * Return -ENOTCONN if no valid path is left (even outside of current_pg). We 2070 + * cannot probe paths in other pgs without switching current_pg, so if valid 2071 + * paths are only in different pgs, they may or may not work. Additionally 2072 + * we should not probe paths in a pathgroup that is in the process of 2073 + * Initializing. Userspace can submit a request and we'll switch and wait 2074 + * for the pathgroup to be initialized. If the request fails, it may need to 2075 + * probe again. 2076 + */ 2077 + static int probe_active_paths(struct multipath *m) 2078 + { 2079 + struct pgpath *pgpath; 2080 + struct priority_group *pg = NULL; 2081 + int r = 0; 2082 + 2083 + spin_lock_irq(&m->lock); 2084 + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { 2085 + wait_event_lock_irq(m->probe_wait, 2086 + !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), 2087 + m->lock); 2088 + /* 2089 + * if we waited because a probe was already in progress, 2090 + * and it probed the current active pathgroup, don't 2091 + * reprobe. Just return the number of valid paths 2092 + */ 2093 + if (m->current_pg == m->last_probed_pg) 2094 + goto skip_probe; 2095 + } 2096 + if (!m->current_pg || m->is_suspending || 2097 + test_bit(MPATHF_QUEUE_IO, &m->flags)) 2098 + goto skip_probe; 2099 + set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); 2100 + pg = m->last_probed_pg = m->current_pg; 2101 + spin_unlock_irq(&m->lock); 2102 + 2103 + list_for_each_entry(pgpath, &pg->pgpaths, list) { 2104 + if (pg != READ_ONCE(m->current_pg) || 2105 + READ_ONCE(m->is_suspending)) 2106 + goto out; 2107 + if (!pgpath->is_active) 2108 + continue; 2109 + 2110 + r = probe_path(pgpath); 2111 + if (r < 0) 2112 + goto out; 2113 + } 2114 + 2115 + out: 2116 + spin_lock_irq(&m->lock); 2117 + clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); 2118 + if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { 2119 + m->current_pgpath = NULL; 2120 + m->current_pg = NULL; 2121 + } 2122 + skip_probe: 2123 + if (r == 0 && !atomic_read(&m->nr_valid_paths)) 2124 + r = -ENOTCONN; 2125 + spin_unlock_irq(&m->lock); 2126 + if (pg) 2127 + wake_up(&m->probe_wait); 2128 + return r; 2129 + } 2130 + 2031 2131 static int multipath_prepare_ioctl(struct dm_target *ti, 2032 - struct block_device **bdev) 2132 + struct block_device **bdev, 2133 + unsigned int cmd, unsigned long arg, 2134 + bool *forward) 2033 2135 { 2034 2136 struct multipath *m = ti->private; 2035 2137 struct pgpath *pgpath; 2036 - unsigned long flags; 2037 2138 int r; 2139 + 2140 + if (_IOC_TYPE(cmd) == DM_IOCTL) { 2141 + *forward = false; 2142 + switch (cmd) { 2143 + case DM_MPATH_PROBE_PATHS: 2144 + return probe_active_paths(m); 2145 + default: 2146 + return -ENOTTY; 2147 + } 2148 + } 2038 2149 2039 2150 pgpath = READ_ONCE(m->current_pgpath); 2040 2151 if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) ··· 2169 2044 } else { 2170 2045 /* No path is available */ 2171 2046 r = -EIO; 2172 - spin_lock_irqsave(&m->lock, flags); 2047 + spin_lock_irq(&m->lock); 2173 2048 if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) 2174 2049 r = -ENOTCONN; 2175 - spin_unlock_irqrestore(&m->lock, flags); 2050 + spin_unlock_irq(&m->lock); 2176 2051 } 2177 2052 2178 2053 if (r == -ENOTCONN) { ··· 2180 2055 /* Path status changed, redo selection */ 2181 2056 (void) choose_pgpath(m, 0); 2182 2057 } 2183 - spin_lock_irqsave(&m->lock, flags); 2058 + spin_lock_irq(&m->lock); 2184 2059 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) 2185 2060 (void) __pg_init_all_paths(m); 2186 - spin_unlock_irqrestore(&m->lock, flags); 2061 + spin_unlock_irq(&m->lock); 2187 2062 dm_table_run_md_queue_async(m->ti->table); 2188 2063 process_queued_io_list(m); 2189 2064 } ··· 2305 2180 */ 2306 2181 static struct target_type multipath_target = { 2307 2182 .name = "multipath", 2308 - .version = {1, 14, 0}, 2183 + .version = {1, 15, 0}, 2309 2184 .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | 2310 2185 DM_TARGET_PASSES_INTEGRITY, 2311 2186 .module = THIS_MODULE,

+2 -3

drivers/md/dm-raid1.c

··· 133 133 spin_lock_irqsave(&ms->lock, flags); 134 134 should_wake = !(bl->head); 135 135 bio_list_add(bl, bio); 136 - spin_unlock_irqrestore(&ms->lock, flags); 137 - 138 136 if (should_wake) 139 137 wakeup_mirrord(ms); 138 + spin_unlock_irqrestore(&ms->lock, flags); 140 139 } 141 140 142 141 static void dispatch_bios(void *context, struct bio_list *bio_list) ··· 645 646 if (!ms->failures.head) 646 647 should_wake = 1; 647 648 bio_list_add(&ms->failures, bio); 648 - spin_unlock_irqrestore(&ms->lock, flags); 649 649 if (should_wake) 650 650 wakeup_mirrord(ms); 651 + spin_unlock_irqrestore(&ms->lock, flags); 651 652 } 652 653 653 654 static void do_write(struct mirror_set *ms, struct bio *bio)

+2 -2

drivers/md/dm-rq.c

··· 217 217 if (unlikely(error == BLK_STS_TARGET)) { 218 218 if (req_op(clone) == REQ_OP_DISCARD && 219 219 !clone->q->limits.max_discard_sectors) 220 - disable_discard(tio->md); 220 + blk_queue_disable_discard(tio->md->queue); 221 221 else if (req_op(clone) == REQ_OP_WRITE_ZEROES && 222 222 !clone->q->limits.max_write_zeroes_sectors) 223 - disable_write_zeroes(tio->md); 223 + blk_queue_disable_write_zeroes(tio->md->queue); 224 224 } 225 225 226 226 switch (r) {

+2 -3

drivers/md/dm-stripe.c

··· 405 405 blk_status_t *error) 406 406 { 407 407 unsigned int i; 408 - char major_minor[16]; 408 + char major_minor[22]; 409 409 struct stripe_c *sc = ti->private; 410 410 411 411 if (!*error) ··· 417 417 if (*error == BLK_STS_NOTSUPP) 418 418 return DM_ENDIO_DONE; 419 419 420 - memset(major_minor, 0, sizeof(major_minor)); 421 - sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); 420 + format_dev_t(major_minor, bio_dev(bio)); 422 421 423 422 /* 424 423 * Test to see which stripe drive triggered the event

+3 -1

drivers/md/dm-switch.c

··· 517 517 * 518 518 * Passthrough all ioctls to the path for sector 0 519 519 */ 520 - static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 520 + static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, 521 + unsigned int cmd, unsigned long arg, 522 + bool *forward) 521 523 { 522 524 struct switch_ctx *sctx = ti->private; 523 525 unsigned int path_nr;

+245 -18

drivers/md/dm-table.c

··· 117 117 n_targets = (struct dm_target *) (n_highs + num); 118 118 119 119 memset(n_highs, -1, sizeof(*n_highs) * num); 120 - kvfree(t->highs); 121 120 122 121 t->num_allocated = num; 123 122 t->highs = n_highs; ··· 256 257 if (bdev_is_zoned(bdev)) { 257 258 unsigned int zone_sectors = bdev_zone_sectors(bdev); 258 259 259 - if (start & (zone_sectors - 1)) { 260 + if (!bdev_is_zone_aligned(bdev, start)) { 260 261 DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg", 261 262 dm_device_name(ti->table->md), 262 263 (unsigned long long)start, ··· 273 274 * devices do not end up with a smaller zone in the middle of 274 275 * the sector range. 275 276 */ 276 - if (len & (zone_sectors - 1)) { 277 + if (!bdev_is_zone_aligned(bdev, len)) { 277 278 DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg", 278 279 dm_device_name(ti->table->md), 279 280 (unsigned long long)len, ··· 430 431 return 0; 431 432 } 432 433 434 + mutex_lock(&q->limits_lock); 435 + /* 436 + * BLK_FEAT_ATOMIC_WRITES is not inherited from the bottom device in 437 + * blk_stack_limits(), so do it manually. 438 + */ 439 + limits->features |= (q->limits.features & BLK_FEAT_ATOMIC_WRITES); 440 + 433 441 if (blk_stack_limits(limits, &q->limits, 434 442 get_start_sect(bdev) + start) < 0) 435 443 DMWARN("%s: adding target device %pg caused an alignment inconsistency: " ··· 454 448 */ 455 449 if (!dm_target_has_integrity(ti->type)) 456 450 queue_limits_stack_integrity_bdev(limits, bdev); 451 + mutex_unlock(&q->limits_lock); 457 452 return 0; 458 453 } 459 454 ··· 1196 1189 return 0; 1197 1190 } 1198 1191 1192 + enum dm_wrappedkey_op { 1193 + DERIVE_SW_SECRET, 1194 + IMPORT_KEY, 1195 + GENERATE_KEY, 1196 + PREPARE_KEY, 1197 + }; 1198 + 1199 + struct dm_wrappedkey_op_args { 1200 + enum dm_wrappedkey_op op; 1201 + int err; 1202 + union { 1203 + struct { 1204 + const u8 *eph_key; 1205 + size_t eph_key_size; 1206 + u8 *sw_secret; 1207 + } derive_sw_secret; 1208 + struct { 1209 + const u8 *raw_key; 1210 + size_t raw_key_size; 1211 + u8 *lt_key; 1212 + } import_key; 1213 + struct { 1214 + u8 *lt_key; 1215 + } generate_key; 1216 + struct { 1217 + const u8 *lt_key; 1218 + size_t lt_key_size; 1219 + u8 *eph_key; 1220 + } prepare_key; 1221 + }; 1222 + }; 1223 + 1224 + static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev, 1225 + sector_t start, sector_t len, void *data) 1226 + { 1227 + struct dm_wrappedkey_op_args *args = data; 1228 + struct block_device *bdev = dev->bdev; 1229 + struct blk_crypto_profile *profile = 1230 + bdev_get_queue(bdev)->crypto_profile; 1231 + int err = -EOPNOTSUPP; 1232 + 1233 + if (!args->err) 1234 + return 0; 1235 + 1236 + switch (args->op) { 1237 + case DERIVE_SW_SECRET: 1238 + err = blk_crypto_derive_sw_secret( 1239 + bdev, 1240 + args->derive_sw_secret.eph_key, 1241 + args->derive_sw_secret.eph_key_size, 1242 + args->derive_sw_secret.sw_secret); 1243 + break; 1244 + case IMPORT_KEY: 1245 + err = blk_crypto_import_key(profile, 1246 + args->import_key.raw_key, 1247 + args->import_key.raw_key_size, 1248 + args->import_key.lt_key); 1249 + break; 1250 + case GENERATE_KEY: 1251 + err = blk_crypto_generate_key(profile, 1252 + args->generate_key.lt_key); 1253 + break; 1254 + case PREPARE_KEY: 1255 + err = blk_crypto_prepare_key(profile, 1256 + args->prepare_key.lt_key, 1257 + args->prepare_key.lt_key_size, 1258 + args->prepare_key.eph_key); 1259 + break; 1260 + } 1261 + args->err = err; 1262 + 1263 + /* Try another device in case this fails. */ 1264 + return 0; 1265 + } 1266 + 1267 + static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile, 1268 + struct dm_wrappedkey_op_args *args) 1269 + { 1270 + struct mapped_device *md = 1271 + container_of(profile, struct dm_crypto_profile, profile)->md; 1272 + struct dm_target *ti; 1273 + struct dm_table *t; 1274 + int srcu_idx; 1275 + int i; 1276 + 1277 + args->err = -EOPNOTSUPP; 1278 + 1279 + t = dm_get_live_table(md, &srcu_idx); 1280 + if (!t) 1281 + goto out; 1282 + 1283 + /* 1284 + * blk-crypto currently has no support for multiple incompatible 1285 + * implementations of wrapped inline crypto keys on a single system. 1286 + * It was already checked earlier that support for wrapped keys was 1287 + * declared on all underlying devices. Thus, all the underlying devices 1288 + * should support all wrapped key operations and they should behave 1289 + * identically, i.e. work with the same keys. So, just executing the 1290 + * operation on the first device on which it works suffices for now. 1291 + */ 1292 + for (i = 0; i < t->num_targets; i++) { 1293 + ti = dm_table_get_target(t, i); 1294 + if (!ti->type->iterate_devices) 1295 + continue; 1296 + ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args); 1297 + if (!args->err) 1298 + break; 1299 + } 1300 + out: 1301 + dm_put_live_table(md, srcu_idx); 1302 + return args->err; 1303 + } 1304 + 1305 + static int dm_derive_sw_secret(struct blk_crypto_profile *profile, 1306 + const u8 *eph_key, size_t eph_key_size, 1307 + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) 1308 + { 1309 + struct dm_wrappedkey_op_args args = { 1310 + .op = DERIVE_SW_SECRET, 1311 + .derive_sw_secret = { 1312 + .eph_key = eph_key, 1313 + .eph_key_size = eph_key_size, 1314 + .sw_secret = sw_secret, 1315 + }, 1316 + }; 1317 + return dm_exec_wrappedkey_op(profile, &args); 1318 + } 1319 + 1320 + static int dm_import_key(struct blk_crypto_profile *profile, 1321 + const u8 *raw_key, size_t raw_key_size, 1322 + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) 1323 + { 1324 + struct dm_wrappedkey_op_args args = { 1325 + .op = IMPORT_KEY, 1326 + .import_key = { 1327 + .raw_key = raw_key, 1328 + .raw_key_size = raw_key_size, 1329 + .lt_key = lt_key, 1330 + }, 1331 + }; 1332 + return dm_exec_wrappedkey_op(profile, &args); 1333 + } 1334 + 1335 + static int dm_generate_key(struct blk_crypto_profile *profile, 1336 + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) 1337 + { 1338 + struct dm_wrappedkey_op_args args = { 1339 + .op = GENERATE_KEY, 1340 + .generate_key = { 1341 + .lt_key = lt_key, 1342 + }, 1343 + }; 1344 + return dm_exec_wrappedkey_op(profile, &args); 1345 + } 1346 + 1347 + static int dm_prepare_key(struct blk_crypto_profile *profile, 1348 + const u8 *lt_key, size_t lt_key_size, 1349 + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) 1350 + { 1351 + struct dm_wrappedkey_op_args args = { 1352 + .op = PREPARE_KEY, 1353 + .prepare_key = { 1354 + .lt_key = lt_key, 1355 + .lt_key_size = lt_key_size, 1356 + .eph_key = eph_key, 1357 + }, 1358 + }; 1359 + return dm_exec_wrappedkey_op(profile, &args); 1360 + } 1361 + 1199 1362 static int 1200 1363 device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, 1201 1364 sector_t start, sector_t len, void *data) ··· 1438 1261 ti->type->iterate_devices(ti, 1439 1262 device_intersect_crypto_capabilities, 1440 1263 profile); 1264 + } 1265 + 1266 + if (profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) { 1267 + profile->ll_ops.derive_sw_secret = dm_derive_sw_secret; 1268 + profile->ll_ops.import_key = dm_import_key; 1269 + profile->ll_ops.generate_key = dm_generate_key; 1270 + profile->ll_ops.prepare_key = dm_prepare_key; 1441 1271 } 1442 1272 1443 1273 if (t->md->queue && ··· 1668 1484 1669 1485 ti->type->iterate_devices(ti, count_device, &num_devices); 1670 1486 if (num_devices) 1487 + return false; 1488 + } 1489 + 1490 + return true; 1491 + } 1492 + 1493 + bool dm_table_is_wildcard(struct dm_table *t) 1494 + { 1495 + for (unsigned int i = 0; i < t->num_targets; i++) { 1496 + struct dm_target *ti = dm_table_get_target(t, i); 1497 + 1498 + if (!dm_target_is_wildcard(ti->type)) 1671 1499 return false; 1672 1500 } 1673 1501 ··· 1917 1721 sector_t start, sector_t len, void *data) 1918 1722 { 1919 1723 struct request_queue *q = bdev_get_queue(dev->bdev); 1724 + int b; 1920 1725 1921 - return !q->limits.max_write_zeroes_sectors; 1726 + mutex_lock(&q->limits_lock); 1727 + b = !q->limits.max_write_zeroes_sectors; 1728 + mutex_unlock(&q->limits_lock); 1729 + return b; 1922 1730 } 1923 1731 1924 1732 static bool dm_table_supports_write_zeroes(struct dm_table *t) ··· 2030 1830 return true; 2031 1831 } 2032 1832 1833 + bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, 1834 + sector_t new_size) 1835 + { 1836 + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && dm_has_zone_plugs(t->md) && 1837 + old_size != new_size) { 1838 + DMWARN("%s: device has zone write plug resources. " 1839 + "Cannot change size", 1840 + dm_device_name(t->md)); 1841 + return false; 1842 + } 1843 + return true; 1844 + } 1845 + 2033 1846 int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 2034 1847 struct queue_limits *limits) 2035 1848 { 2036 1849 int r; 1850 + struct queue_limits old_limits; 2037 1851 2038 1852 if (!dm_table_supports_nowait(t)) 2039 1853 limits->features &= ~BLK_FEAT_NOWAIT; ··· 2074 1860 if (dm_table_supports_flush(t)) 2075 1861 limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; 2076 1862 2077 - if (dm_table_supports_dax(t, device_not_dax_capable)) { 1863 + if (dm_table_supports_dax(t, device_not_dax_capable)) 2078 1864 limits->features |= BLK_FEAT_DAX; 2079 - if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) 2080 - set_dax_synchronous(t->md->dax_dev); 2081 - } else 1865 + else 2082 1866 limits->features &= ~BLK_FEAT_DAX; 2083 1867 2084 - if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) 2085 - dax_write_cache(t->md->dax_dev, true); 2086 - 2087 1868 /* For a zoned table, setup the zone related queue attributes. */ 2088 - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 2089 - (limits->features & BLK_FEAT_ZONED)) { 2090 - r = dm_set_zones_restrictions(t, q, limits); 2091 - if (r) 2092 - return r; 1869 + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { 1870 + if (limits->features & BLK_FEAT_ZONED) { 1871 + r = dm_set_zones_restrictions(t, q, limits); 1872 + if (r) 1873 + return r; 1874 + } else if (dm_has_zone_plugs(t->md)) { 1875 + DMWARN("%s: device has zone write plug resources. " 1876 + "Cannot switch to non-zoned table.", 1877 + dm_device_name(t->md)); 1878 + return -EINVAL; 1879 + } 2093 1880 } 2094 1881 2095 1882 if (dm_table_supports_atomic_writes(t)) 2096 1883 limits->features |= BLK_FEAT_ATOMIC_WRITES; 2097 1884 2098 - r = queue_limits_set(q, limits); 1885 + old_limits = queue_limits_start_update(q); 1886 + r = queue_limits_commit_update(q, limits); 2099 1887 if (r) 2100 1888 return r; 2101 1889 ··· 2108 1892 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 2109 1893 (limits->features & BLK_FEAT_ZONED)) { 2110 1894 r = dm_revalidate_zones(t, q); 2111 - if (r) 1895 + if (r) { 1896 + queue_limits_set(q, &old_limits); 2112 1897 return r; 1898 + } 2113 1899 } 1900 + 1901 + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) 1902 + dm_finalize_zone_settings(t, limits); 1903 + 1904 + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) 1905 + set_dax_synchronous(t->md->dax_dev); 1906 + 1907 + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) 1908 + dax_write_cache(t->md->dax_dev, true); 2114 1909 2115 1910 dm_update_crypto_profile(q, t); 2116 1911 return 0;

+13 -11

drivers/md/dm-vdo/indexer/volume.c

··· 754 754 u32 physical_page, struct cached_page **page_ptr) 755 755 { 756 756 struct cached_page *page; 757 + unsigned int zone_number = request->zone_number; 757 758 758 759 get_page_from_cache(&volume->page_cache, physical_page, &page); 759 760 if (page != NULL) { 760 - if (request->zone_number == 0) { 761 + if (zone_number == 0) { 761 762 /* Only one zone is allowed to update the LRU. */ 762 763 make_page_most_recent(&volume->page_cache, page); 763 764 } ··· 768 767 } 769 768 770 769 /* Prepare to enqueue a read for the page. */ 771 - end_pending_search(&volume->page_cache, request->zone_number); 770 + end_pending_search(&volume->page_cache, zone_number); 772 771 mutex_lock(&volume->read_threads_mutex); 773 772 774 773 /* ··· 788 787 * the order does not matter for correctness as it does below. 789 788 */ 790 789 mutex_unlock(&volume->read_threads_mutex); 791 - begin_pending_search(&volume->page_cache, physical_page, 792 - request->zone_number); 790 + begin_pending_search(&volume->page_cache, physical_page, zone_number); 793 791 return UDS_QUEUED; 794 792 } 795 793 ··· 797 797 * "search pending" state in careful order so no other thread can mess with the data before 798 798 * the caller gets to look at it. 799 799 */ 800 - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); 800 + begin_pending_search(&volume->page_cache, physical_page, zone_number); 801 801 mutex_unlock(&volume->read_threads_mutex); 802 802 *page_ptr = page; 803 803 return UDS_SUCCESS; ··· 849 849 { 850 850 int result; 851 851 struct cached_page *page = NULL; 852 + unsigned int zone_number = request->zone_number; 852 853 u32 physical_page = map_to_physical_page(volume->geometry, chapter, 853 854 index_page_number); 854 855 ··· 859 858 * invalidation by the reader thread, before the reader thread has noticed that the 860 859 * invalidate_counter has been incremented. 861 860 */ 862 - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); 861 + begin_pending_search(&volume->page_cache, physical_page, zone_number); 863 862 864 863 result = get_volume_page_protected(volume, request, physical_page, &page); 865 864 if (result != UDS_SUCCESS) { 866 - end_pending_search(&volume->page_cache, request->zone_number); 865 + end_pending_search(&volume->page_cache, zone_number); 867 866 return result; 868 867 } 869 868 870 869 result = uds_search_chapter_index_page(&page->index_page, volume->geometry, 871 870 &request->record_name, 872 871 record_page_number); 873 - end_pending_search(&volume->page_cache, request->zone_number); 872 + end_pending_search(&volume->page_cache, zone_number); 874 873 return result; 875 874 } 876 875 ··· 883 882 { 884 883 struct cached_page *record_page; 885 884 struct index_geometry *geometry = volume->geometry; 885 + unsigned int zone_number = request->zone_number; 886 886 int result; 887 887 u32 physical_page, page_number; 888 888 ··· 907 905 * invalidation by the reader thread, before the reader thread has noticed that the 908 906 * invalidate_counter has been incremented. 909 907 */ 910 - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); 908 + begin_pending_search(&volume->page_cache, physical_page, zone_number); 911 909 912 910 result = get_volume_page_protected(volume, request, physical_page, &record_page); 913 911 if (result != UDS_SUCCESS) { 914 - end_pending_search(&volume->page_cache, request->zone_number); 912 + end_pending_search(&volume->page_cache, zone_number); 915 913 return result; 916 914 } 917 915 ··· 919 917 &request->record_name, geometry, &request->old_metadata)) 920 918 *found = true; 921 919 922 - end_pending_search(&volume->page_cache, request->zone_number); 920 + end_pending_search(&volume->page_cache, zone_number); 923 921 return UDS_SUCCESS; 924 922 } 925 923

+4

drivers/md/dm-verity-fec.c

··· 593 593 (*argc)--; 594 594 595 595 if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { 596 + if (v->fec->dev) { 597 + ti->error = "FEC device already specified"; 598 + return -EINVAL; 599 + } 596 600 r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev); 597 601 if (r) { 598 602 ti->error = "FEC device lookup failed";

+12 -3

drivers/md/dm-verity-target.c

··· 682 682 static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) 683 683 { 684 684 return ioprio <= IOPRIO_CLASS_IDLE && 685 - bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); 685 + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]) && 686 + !need_resched(); 686 687 } 687 688 688 689 static void verity_end_io(struct bio *bio) ··· 994 993 } 995 994 } 996 995 997 - static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 996 + static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, 997 + unsigned int cmd, unsigned long arg, 998 + bool *forward) 998 999 { 999 1000 struct dm_verity *v = ti->private; 1000 1001 ··· 1123 1120 { 1124 1121 struct dm_target *ti = v->ti; 1125 1122 1123 + if (v->validated_blocks) 1124 + return 0; 1125 + 1126 1126 /* the bitset can only handle INT_MAX blocks */ 1127 1127 if (v->data_blocks > INT_MAX) { 1128 1128 ti->error = "device too large to use check_at_most_once"; ··· 1148 1142 int r = -ENOMEM; 1149 1143 struct dm_verity_io *io; 1150 1144 u8 *zero_data; 1145 + 1146 + if (v->zero_digest) 1147 + return 0; 1151 1148 1152 1149 v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); 1153 1150 ··· 1586 1577 goto bad; 1587 1578 } 1588 1579 1589 - /* Root hash signature is a optional parameter*/ 1580 + /* Root hash signature is an optional parameter */ 1590 1581 r = verity_verify_root_hash(root_hash_digest_to_validate, 1591 1582 strlen(root_hash_digest_to_validate), 1592 1583 verify_args.sig,

+13 -4

drivers/md/dm-verity-verify-sig.c

··· 71 71 const char *arg_name) 72 72 { 73 73 struct dm_target *ti = v->ti; 74 - int ret = 0; 74 + int ret; 75 75 const char *sig_key = NULL; 76 + 77 + if (v->signature_key_desc) { 78 + ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified"); 79 + return -EINVAL; 80 + } 76 81 77 82 if (!*argc) { 78 83 ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); ··· 88 83 (*argc)--; 89 84 90 85 ret = verity_verify_get_sig_from_key(sig_key, sig_opts); 91 - if (ret < 0) 86 + if (ret < 0) { 92 87 ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); 88 + return ret; 89 + } 93 90 94 91 v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); 95 - if (!v->signature_key_desc) 92 + if (!v->signature_key_desc) { 93 + ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key"); 96 94 return -ENOMEM; 95 + } 97 96 98 - return ret; 97 + return 0; 99 98 } 100 99 101 100 /*

+69 -29

drivers/md/dm-zone.c

··· 56 56 { 57 57 struct mapped_device *md = disk->private_data; 58 58 struct dm_table *map; 59 - int srcu_idx, ret; 59 + struct dm_table *zone_revalidate_map = md->zone_revalidate_map; 60 + int srcu_idx, ret = -EIO; 61 + bool put_table = false; 60 62 61 - if (!md->zone_revalidate_map) { 62 - /* Regular user context */ 63 + if (!zone_revalidate_map || md->revalidate_map_task != current) { 64 + /* 65 + * Regular user context or 66 + * Zone revalidation during __bind() is in progress, but this 67 + * call is from a different process 68 + */ 63 69 if (dm_suspended_md(md)) 64 70 return -EAGAIN; 65 71 66 72 map = dm_get_live_table(md, &srcu_idx); 67 - if (!map) 68 - return -EIO; 73 + put_table = true; 69 74 } else { 70 75 /* Zone revalidation during __bind() */ 71 - map = md->zone_revalidate_map; 76 + map = zone_revalidate_map; 72 77 } 73 78 74 - ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); 79 + if (map) 80 + ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, 81 + data); 75 82 76 - if (!md->zone_revalidate_map) 83 + if (put_table) 77 84 dm_put_live_table(md, srcu_idx); 78 85 79 86 return ret; ··· 160 153 { 161 154 struct mapped_device *md = t->md; 162 155 struct gendisk *disk = md->disk; 156 + unsigned int nr_zones = disk->nr_zones; 163 157 int ret; 164 158 165 159 if (!get_capacity(disk)) 166 160 return 0; 167 161 168 - /* Revalidate only if something changed. */ 169 - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { 170 - DMINFO("%s using %s zone append", 171 - disk->disk_name, 172 - queue_emulates_zone_append(q) ? "emulated" : "native"); 173 - md->nr_zones = 0; 174 - } 175 - 176 - if (md->nr_zones) 162 + /* 163 + * Do not revalidate if zone write plug resources have already 164 + * been allocated. 165 + */ 166 + if (dm_has_zone_plugs(md)) 177 167 return 0; 168 + 169 + DMINFO("%s using %s zone append", disk->disk_name, 170 + queue_emulates_zone_append(q) ? "emulated" : "native"); 178 171 179 172 /* 180 173 * Our table is not live yet. So the call to dm_get_live_table() ··· 182 175 * our table for dm_blk_report_zones() to use directly. 183 176 */ 184 177 md->zone_revalidate_map = t; 178 + md->revalidate_map_task = current; 185 179 ret = blk_revalidate_disk_zones(disk); 180 + md->revalidate_map_task = NULL; 186 181 md->zone_revalidate_map = NULL; 187 182 188 183 if (ret) { 189 184 DMERR("Revalidate zones failed %d", ret); 185 + disk->nr_zones = nr_zones; 190 186 return ret; 191 187 } 192 188 ··· 347 337 348 338 /* 349 339 * Check if zone append is natively supported, and if not, set the 350 - * mapped device queue as needing zone append emulation. 340 + * mapped device queue as needing zone append emulation. If zone 341 + * append is natively supported, make sure that 342 + * max_hw_zone_append_sectors is not set to 0. 351 343 */ 352 344 WARN_ON_ONCE(queue_is_mq(q)); 353 - if (dm_table_supports_zone_append(t)) { 354 - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 355 - } else { 356 - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 345 + if (!dm_table_supports_zone_append(t)) 357 346 lim->max_hw_zone_append_sectors = 0; 358 - } 347 + else if (lim->max_hw_zone_append_sectors == 0) 348 + lim->max_hw_zone_append_sectors = lim->max_zone_append_sectors; 359 349 360 350 /* 361 351 * Determine the max open and max active zone limits for the mapped ··· 390 380 lim->max_open_zones = 0; 391 381 lim->max_active_zones = 0; 392 382 lim->max_hw_zone_append_sectors = 0; 383 + lim->max_zone_append_sectors = 0; 393 384 lim->zone_write_granularity = 0; 394 385 lim->chunk_sectors = 0; 395 386 lim->features &= ~BLK_FEAT_ZONED; 396 - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 397 - md->nr_zones = 0; 398 - disk->nr_zones = 0; 399 387 return 0; 400 388 } 401 389 390 + if (get_capacity(disk) && dm_has_zone_plugs(t->md)) { 391 + if (q->limits.chunk_sectors != lim->chunk_sectors) { 392 + DMWARN("%s: device has zone write plug resources. " 393 + "Cannot change zone size", 394 + disk->disk_name); 395 + return -EINVAL; 396 + } 397 + if (lim->max_hw_zone_append_sectors != 0 && 398 + !dm_table_is_wildcard(t)) { 399 + DMWARN("%s: device has zone write plug resources. " 400 + "New table must emulate zone append", 401 + disk->disk_name); 402 + return -EINVAL; 403 + } 404 + } 402 405 /* 403 406 * Warn once (when the capacity is not yet set) if the mapped device is 404 407 * partially using zone resources of the target devices as that leads to ··· 431 408 return 0; 432 409 } 433 410 411 + void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim) 412 + { 413 + struct mapped_device *md = t->md; 414 + 415 + if (lim->features & BLK_FEAT_ZONED) { 416 + if (dm_table_supports_zone_append(t)) 417 + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 418 + else 419 + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 420 + } else { 421 + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 422 + md->nr_zones = 0; 423 + md->disk->nr_zones = 0; 424 + } 425 + } 426 + 427 + 434 428 /* 435 429 * IO completion callback called from clone_endio(). 436 430 */ ··· 463 423 */ 464 424 if (clone->bi_status == BLK_STS_OK && 465 425 bio_op(clone) == REQ_OP_ZONE_APPEND) { 466 - sector_t mask = bdev_zone_sectors(disk->part0) - 1; 467 - 468 - orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; 426 + orig_bio->bi_iter.bi_sector += 427 + bdev_offset_from_zone_start(disk->part0, 428 + clone->bi_iter.bi_sector); 469 429 } 470 430 471 431 return;

+2 -1

drivers/md/dm-zoned-target.c

··· 1015 1015 /* 1016 1016 * Pass on ioctl to the backend device. 1017 1017 */ 1018 - static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 1018 + static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, 1019 + unsigned int cmd, unsigned long arg, bool *forward) 1019 1020 { 1020 1021 struct dmz_target *dmz = ti->private; 1021 1022 struct dmz_dev *dev = &dmz->dev[0];

+35 -38

drivers/md/dm.c

··· 411 411 } 412 412 413 413 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, 414 - struct block_device **bdev) 414 + struct block_device **bdev, unsigned int cmd, 415 + unsigned long arg, bool *forward) 415 416 { 416 417 struct dm_target *ti; 417 418 struct dm_table *map; ··· 435 434 if (dm_suspended_md(md)) 436 435 return -EAGAIN; 437 436 438 - r = ti->type->prepare_ioctl(ti, bdev); 439 - if (r == -ENOTCONN && !fatal_signal_pending(current)) { 437 + r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward); 438 + if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) { 440 439 dm_put_live_table(md, *srcu_idx); 441 440 fsleep(10000); 442 441 goto retry; ··· 455 454 { 456 455 struct mapped_device *md = bdev->bd_disk->private_data; 457 456 int r, srcu_idx; 457 + bool forward = true; 458 458 459 - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 460 - if (r < 0) 459 + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward); 460 + if (!forward || r < 0) 461 461 goto out; 462 462 463 463 if (r > 0) { ··· 1084 1082 return &md->queue->limits; 1085 1083 } 1086 1084 1087 - void disable_discard(struct mapped_device *md) 1088 - { 1089 - struct queue_limits *limits = dm_get_queue_limits(md); 1090 - 1091 - /* device doesn't really support DISCARD, disable it */ 1092 - limits->max_hw_discard_sectors = 0; 1093 - } 1094 - 1095 - void disable_write_zeroes(struct mapped_device *md) 1096 - { 1097 - struct queue_limits *limits = dm_get_queue_limits(md); 1098 - 1099 - /* device doesn't really support WRITE ZEROES, disable it */ 1100 - limits->max_write_zeroes_sectors = 0; 1101 - } 1102 - 1103 1085 static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) 1104 1086 { 1105 1087 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); ··· 1101 1115 if (unlikely(error == BLK_STS_TARGET)) { 1102 1116 if (bio_op(bio) == REQ_OP_DISCARD && 1103 1117 !bdev_max_discard_sectors(bio->bi_bdev)) 1104 - disable_discard(md); 1118 + blk_queue_disable_discard(md->queue); 1105 1119 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && 1106 1120 !bdev_write_zeroes_sectors(bio->bi_bdev)) 1107 - disable_write_zeroes(md); 1121 + blk_queue_disable_write_zeroes(md->queue); 1108 1122 } 1109 1123 1110 1124 if (static_branch_unlikely(&zoned_enabled) && ··· 2407 2421 struct queue_limits *limits) 2408 2422 { 2409 2423 struct dm_table *old_map; 2410 - sector_t size; 2424 + sector_t size, old_size; 2411 2425 int ret; 2412 2426 2413 2427 lockdep_assert_held(&md->suspend_lock); 2414 2428 2415 2429 size = dm_table_get_size(t); 2416 2430 2431 + old_size = dm_get_size(md); 2432 + 2433 + if (!dm_table_supports_size_change(t, old_size, size)) { 2434 + old_map = ERR_PTR(-EINVAL); 2435 + goto out; 2436 + } 2437 + 2438 + set_capacity(md->disk, size); 2439 + 2440 + ret = dm_table_set_restrictions(t, md->queue, limits); 2441 + if (ret) { 2442 + set_capacity(md->disk, old_size); 2443 + old_map = ERR_PTR(ret); 2444 + goto out; 2445 + } 2446 + 2417 2447 /* 2418 2448 * Wipe any geometry if the size of the table changed. 2419 2449 */ 2420 - if (size != dm_get_size(md)) 2450 + if (size != old_size) 2421 2451 memset(&md->geometry, 0, sizeof(md->geometry)); 2422 - 2423 - set_capacity(md->disk, size); 2424 2452 2425 2453 dm_table_event_callback(t, event_callback, md); 2426 2454 ··· 2453 2453 * requests in the queue may refer to bio from the old bioset, 2454 2454 * so you must walk through the queue to unprep. 2455 2455 */ 2456 - if (!md->mempools) { 2456 + if (!md->mempools) 2457 2457 md->mempools = t->mempools; 2458 - t->mempools = NULL; 2459 - } 2458 + else 2459 + dm_free_md_mempools(t->mempools); 2460 2460 } else { 2461 2461 /* 2462 2462 * The md may already have mempools that need changing. ··· 2465 2465 */ 2466 2466 dm_free_md_mempools(md->mempools); 2467 2467 md->mempools = t->mempools; 2468 - t->mempools = NULL; 2469 2468 } 2470 - 2471 - ret = dm_table_set_restrictions(t, md->queue, limits); 2472 - if (ret) { 2473 - old_map = ERR_PTR(ret); 2474 - goto out; 2475 - } 2469 + t->mempools = NULL; 2476 2470 2477 2471 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); 2478 2472 rcu_assign_pointer(md->map, (void *)t); ··· 3632 3638 struct mapped_device *md = bdev->bd_disk->private_data; 3633 3639 const struct pr_ops *ops; 3634 3640 int r, srcu_idx; 3641 + bool forward = true; 3635 3642 3636 - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); 3643 + /* Not a real ioctl, but targets must not interpret non-DM ioctls */ 3644 + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward); 3637 3645 if (r < 0) 3638 3646 goto out; 3647 + WARN_ON_ONCE(!forward); 3639 3648 3640 3649 ops = bdev->bd_disk->fops->pr_ops; 3641 3650 if (ops && ops->pr_clear)

+6

drivers/md/dm.h

··· 58 58 void (*fn)(void *), void *context); 59 59 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); 60 60 bool dm_table_has_no_data_devices(struct dm_table *table); 61 + bool dm_table_is_wildcard(struct dm_table *t); 61 62 int dm_calculate_queue_limits(struct dm_table *table, 62 63 struct queue_limits *limits); 63 64 int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, ··· 73 72 struct dm_target *dm_table_get_immutable_target(struct dm_table *t); 74 73 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); 75 74 bool dm_table_request_based(struct dm_table *t); 75 + bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, 76 + sector_t new_size); 76 77 77 78 void dm_lock_md_type(struct mapped_device *md); 78 79 void dm_unlock_md_type(struct mapped_device *md); ··· 105 102 int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, 106 103 struct queue_limits *lim); 107 104 int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); 105 + void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim); 108 106 void dm_zone_endio(struct dm_io *io, struct bio *clone); 109 107 #ifdef CONFIG_BLK_DEV_ZONED 110 108 int dm_blk_report_zones(struct gendisk *disk, sector_t sector, ··· 114 110 int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, 115 111 sector_t sector, unsigned int nr_zones, 116 112 unsigned long *need_reset); 113 + #define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL) 117 114 #else 118 115 #define dm_blk_report_zones NULL 119 116 static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) 120 117 { 121 118 return false; 122 119 } 120 + #define dm_has_zone_plugs(md) false 123 121 #endif 124 122 125 123 /*

+7

include/linux/blkdev.h

··· 1456 1456 return bdev_offset_from_zone_start(bdev, sector) == 0; 1457 1457 } 1458 1458 1459 + /* Check whether @sector is a multiple of the zone size. */ 1460 + static inline bool bdev_is_zone_aligned(struct block_device *bdev, 1461 + sector_t sector) 1462 + { 1463 + return bdev_is_zone_start(bdev, sector); 1464 + } 1465 + 1459 1466 /** 1460 1467 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone 1461 1468 * @bdev: block device to check

+8 -1

include/linux/device-mapper.h

··· 93 93 typedef int (*dm_message_fn) (struct dm_target *ti, unsigned int argc, char **argv, 94 94 char *result, unsigned int maxlen); 95 95 96 - typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev); 96 + /* 97 + * Called with *forward == true. If it remains true, the ioctl should be 98 + * forwarded to bdev. If it is reset to false, the target already fully handled 99 + * the ioctl and the return value is the return value for the whole ioctl. 100 + */ 101 + typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev, 102 + unsigned int cmd, unsigned long arg, 103 + bool *forward); 97 104 98 105 #ifdef CONFIG_BLK_DEV_ZONED 99 106 typedef int (*dm_report_zones_fn) (struct dm_target *ti,

+7 -2

include/uapi/linux/dm-ioctl.h

··· 258 258 DM_DEV_SET_GEOMETRY_CMD, 259 259 DM_DEV_ARM_POLL_CMD, 260 260 DM_GET_TARGET_VERSION_CMD, 261 + DM_MPATH_PROBE_PATHS_CMD, 261 262 }; 262 263 263 264 #define DM_IOCTL 0xfd 264 265 266 + /* Control device ioctls */ 265 267 #define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) 266 268 #define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) 267 269 #define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl) ··· 287 285 #define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) 288 286 #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) 289 287 288 + /* Block device ioctls */ 289 + #define DM_MPATH_PROBE_PATHS _IO(DM_IOCTL, DM_MPATH_PROBE_PATHS_CMD) 290 + 290 291 #define DM_VERSION_MAJOR 4 291 - #define DM_VERSION_MINOR 49 292 + #define DM_VERSION_MINOR 50 292 293 #define DM_VERSION_PATCHLEVEL 0 293 - #define DM_VERSION_EXTRA "-ioctl (2025-01-17)" 294 + #define DM_VERSION_EXTRA "-ioctl (2025-04-28)" 294 295 295 296 /* Status bits */ 296 297 #define DM_READONLY_FLAG (1 << 0) /* In/Out */

Configure Feed

Configure Feed