Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'block-6.19-20260102' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull block fixes from Jens Axboe:

- Scan partition tables asynchronously for ublk, similarly to how nvme
does it. This avoids potential deadlocks, which is why nvme does it
that way too. Includes a set of selftests as well.

- MD pull request via Yu:
- Fix null-pointer dereference in raid5 sysfs group_thread_cnt
store (Tuo Li)
- Fix possible mempool corruption during raid1 raid_disks update
via sysfs (FengWei Shih)
- Fix logical_block_size configuration being overwritten during
super_1_validate() (Li Nan)
- Fix forward incompatibility with configurable logical block size:
arrays assembled on new kernels could not be assembled on older
kernels (v6.18 and before) due to non-zero reserved pad rejection
(Li Nan)
- Fix static checker warning about iterator not incremented (Li Nan)

- Skip CPU offlining notifications on unmapped hardware queues

- bfq-iosched block stats fix

- Fix outdated comment in bfq-iosched

* tag 'block-6.19-20260102' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
block, bfq: update outdated comment
blk-mq: skip CPU offline notify on unmapped hctx
selftests/ublk: fix Makefile to rebuild on header changes
selftests/ublk: add test for async partition scan
ublk: scan partition in async way
block,bfq: fix aux stat accumulation destination
md: Fix forward incompatibility from configurable logical block size
md: Fix logical_block_size configuration being overwritten
md: suspend array while updating raid_disks via sysfs
md/raid5: fix possible null-pointer dereferences in raid5_store_group_thread_cnt()
md: Fix static checker warning in analyze_sbs

+174 -27
+1 -1
block/bfq-cgroup.c
··· 380 380 blkg_rwstat_add_aux(&to->merged, &from->merged); 381 381 blkg_rwstat_add_aux(&to->service_time, &from->service_time); 382 382 blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); 383 - bfq_stat_add_aux(&from->time, &from->time); 383 + bfq_stat_add_aux(&to->time, &from->time); 384 384 bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); 385 385 bfq_stat_add_aux(&to->avg_queue_size_samples, 386 386 &from->avg_queue_size_samples);
+1 -1
block/bfq-iosched.h
··· 984 984 * unused for the root group. Used to know whether there 985 985 * are groups with more than one active @bfq_entity 986 986 * (see the comments to the function 987 - * bfq_bfqq_may_idle()). 987 + * bfq_better_to_idle()). 988 988 * @rq_pos_tree: rbtree sorted by next_request position, used when 989 989 * determining if two or more queues have interleaving 990 990 * requests (see bfq_find_close_cooperator()).
+1 -1
block/blk-mq.c
··· 3721 3721 struct blk_mq_hw_ctx, cpuhp_online); 3722 3722 int ret = 0; 3723 3723 3724 - if (blk_mq_hctx_has_online_cpu(hctx, cpu)) 3724 + if (!hctx->nr_ctx || blk_mq_hctx_has_online_cpu(hctx, cpu)) 3725 3725 return 0; 3726 3726 3727 3727 /*
+32 -3
drivers/block/ublk_drv.c
··· 237 237 bool canceling; 238 238 pid_t ublksrv_tgid; 239 239 struct delayed_work exit_work; 240 + struct work_struct partition_scan_work; 240 241 241 242 struct ublk_queue *queues[]; 242 243 }; ··· 254 253 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, 255 254 u16 q_id, u16 tag, struct ublk_io *io, size_t offset); 256 255 static inline unsigned int ublk_req_build_flags(struct request *req); 256 + 257 + static void ublk_partition_scan_work(struct work_struct *work) 258 + { 259 + struct ublk_device *ub = 260 + container_of(work, struct ublk_device, partition_scan_work); 261 + 262 + if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN, 263 + &ub->ub_disk->state))) 264 + return; 265 + 266 + mutex_lock(&ub->ub_disk->open_mutex); 267 + bdev_disk_changed(ub->ub_disk, false); 268 + mutex_unlock(&ub->ub_disk->open_mutex); 269 + } 257 270 258 271 static inline struct ublksrv_io_desc * 259 272 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) ··· 2041 2026 mutex_lock(&ub->mutex); 2042 2027 ublk_stop_dev_unlocked(ub); 2043 2028 mutex_unlock(&ub->mutex); 2029 + flush_work(&ub->partition_scan_work); 2044 2030 ublk_cancel_dev(ub); 2045 2031 } 2046 2032 ··· 2970 2954 2971 2955 ublk_apply_params(ub); 2972 2956 2973 - /* don't probe partitions if any daemon task is un-trusted */ 2974 - if (ub->unprivileged_daemons) 2975 - set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 2957 + /* 2958 + * Suppress partition scan to avoid potential IO hang. 2959 + * 2960 + * If ublk server error occurs during partition scan, the IO may 2961 + * wait while holding ub->mutex, which can deadlock with other 2962 + * operations that need the mutex. Defer partition scan to async 2963 + * work. 2964 + * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set 2965 + * permanently. 2966 + */ 2967 + set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 2976 2968 2977 2969 ublk_get_device(ub); 2978 2970 ub->dev_info.state = UBLK_S_DEV_LIVE; ··· 2996 2972 goto out_put_cdev; 2997 2973 2998 2974 set_bit(UB_STATE_USED, &ub->state); 2975 + 2976 + /* Schedule async partition scan for trusted daemons */ 2977 + if (!ub->unprivileged_daemons) 2978 + schedule_work(&ub->partition_scan_work); 2999 2979 3000 2980 out_put_cdev: 3001 2981 if (ret) { ··· 3166 3138 mutex_init(&ub->mutex); 3167 3139 spin_lock_init(&ub->lock); 3168 3140 mutex_init(&ub->cancel_mutex); 3141 + INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work); 3169 3142 3170 3143 ret = ublk_alloc_dev_number(ub, header->dev_id); 3171 3144 if (ret < 0)
+50 -11
drivers/md/md.c
··· 1999 1999 mddev->layout = le32_to_cpu(sb->layout); 2000 2000 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 2001 2001 mddev->dev_sectors = le64_to_cpu(sb->size); 2002 - mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); 2003 2002 mddev->events = ev1; 2004 2003 mddev->bitmap_info.offset = 0; 2005 2004 mddev->bitmap_info.space = 0; ··· 2013 2014 memcpy(mddev->uuid, sb->set_uuid, 16); 2014 2015 2015 2016 mddev->max_disks = (4096-256)/2; 2017 + 2018 + if (!mddev->logical_block_size) 2019 + mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); 2016 2020 2017 2021 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 2018 2022 mddev->bitmap_info.file == NULL) { ··· 3884 3882 3885 3883 static int analyze_sbs(struct mddev *mddev) 3886 3884 { 3887 - int i; 3888 3885 struct md_rdev *rdev, *freshest, *tmp; 3889 3886 3890 3887 freshest = NULL; ··· 3910 3909 super_types[mddev->major_version]. 3911 3910 validate_super(mddev, NULL/*freshest*/, freshest); 3912 3911 3913 - i = 0; 3914 3912 rdev_for_each_safe(rdev, tmp, mddev) { 3915 3913 if (mddev->max_disks && 3916 - (rdev->desc_nr >= mddev->max_disks || 3917 - i > mddev->max_disks)) { 3914 + rdev->desc_nr >= mddev->max_disks) { 3918 3915 pr_warn("md: %s: %pg: only %d devices permitted\n", 3919 3916 mdname(mddev), rdev->bdev, 3920 3917 mddev->max_disks); ··· 4406 4407 if (err < 0) 4407 4408 return err; 4408 4409 4409 - err = mddev_lock(mddev); 4410 + err = mddev_suspend_and_lock(mddev); 4410 4411 if (err) 4411 4412 return err; 4412 4413 if (mddev->pers) ··· 4431 4432 } else 4432 4433 mddev->raid_disks = n; 4433 4434 out_unlock: 4434 - mddev_unlock(mddev); 4435 + mddev_unlock_and_resume(mddev); 4435 4436 return err ? err : len; 4436 4437 } 4437 4438 static struct md_sysfs_entry md_raid_disks = ··· 5980 5981 if (mddev->major_version == 0) 5981 5982 return -EINVAL; 5982 5983 5983 - if (mddev->pers) 5984 - return -EBUSY; 5985 - 5986 5984 err = kstrtouint(buf, 10, &lbs); 5987 5985 if (err < 0) 5988 5986 return -EINVAL; 5987 + 5988 + if (mddev->pers) { 5989 + unsigned int curr_lbs; 5990 + 5991 + if (mddev->logical_block_size) 5992 + return -EBUSY; 5993 + /* 5994 + * To fix forward compatibility issues, LBS is not 5995 + * configured for arrays from old kernels (<=6.18) by default. 5996 + * If the user confirms no rollback to old kernels, 5997 + * enable LBS by writing current LBS — to prevent data 5998 + * loss from LBS changes. 5999 + */ 6000 + curr_lbs = queue_logical_block_size(mddev->gendisk->queue); 6001 + if (lbs != curr_lbs) 6002 + return -EINVAL; 6003 + 6004 + mddev->logical_block_size = curr_lbs; 6005 + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6006 + pr_info("%s: logical block size configured successfully, array will not be assembled in old kernels (<= 6.18)\n", 6007 + mdname(mddev)); 6008 + return len; 6009 + } 5989 6010 5990 6011 err = mddev_lock(mddev); 5991 6012 if (err) ··· 6182 6163 mdname(mddev)); 6183 6164 return -EINVAL; 6184 6165 } 6185 - mddev->logical_block_size = lim->logical_block_size; 6166 + 6167 + /* Only 1.x meta needs to set logical block size */ 6168 + if (mddev->major_version == 0) 6169 + return 0; 6170 + 6171 + /* 6172 + * Fix forward compatibility issue. Only set LBS by default for 6173 + * new arrays, mddev->events == 0 indicates the array was just 6174 + * created. When assembling an array, read LBS from the superblock 6175 + * instead — LBS is 0 in superblocks created by old kernels. 6176 + */ 6177 + if (!mddev->events) { 6178 + pr_info("%s: array will not be assembled in old kernels that lack configurable LBS support (<= 6.18)\n", 6179 + mdname(mddev)); 6180 + mddev->logical_block_size = lim->logical_block_size; 6181 + } 6182 + 6183 + if (!mddev->logical_block_size) 6184 + pr_warn("%s: echo current LBS to md/logical_block_size to prevent data loss issues from LBS changes.\n" 6185 + "\tNote: After setting, array will not be assembled in old kernels (<= 6.18)\n", 6186 + mdname(mddev)); 6186 6187 6187 6188 return 0; 6188 6189 }
+6 -4
drivers/md/raid5.c
··· 7187 7187 err = mddev_suspend_and_lock(mddev); 7188 7188 if (err) 7189 7189 return err; 7190 + conf = mddev->private; 7191 + if (!conf) { 7192 + mddev_unlock_and_resume(mddev); 7193 + return -ENODEV; 7194 + } 7190 7195 raid5_quiesce(mddev, true); 7191 7196 7192 - conf = mddev->private; 7193 - if (!conf) 7194 - err = -ENODEV; 7195 - else if (new != conf->worker_cnt_per_group) { 7197 + if (new != conf->worker_cnt_per_group) { 7196 7198 old_groups = conf->worker_groups; 7197 7199 if (old_groups) 7198 7200 flush_workqueue(raid5_wq);
+3 -2
tools/testing/selftests/ublk/Makefile
··· 22 22 TEST_PROGS += test_generic_12.sh 23 23 TEST_PROGS += test_generic_13.sh 24 24 TEST_PROGS += test_generic_14.sh 25 + TEST_PROGS += test_generic_15.sh 25 26 26 27 TEST_PROGS += test_null_01.sh 27 28 TEST_PROGS += test_null_02.sh ··· 51 50 52 51 TEST_GEN_PROGS_EXTENDED = kublk 53 52 53 + LOCAL_HDRS += $(wildcard *.h) 54 54 include ../lib.mk 55 55 56 - $(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c \ 57 - fault_inject.c 56 + $(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c) 58 57 59 58 check: 60 59 shellcheck -x -f gcc *.sh
+12 -4
tools/testing/selftests/ublk/test_common.sh
··· 178 178 _create_ublk_dev() { 179 179 local dev_id; 180 180 local cmd=$1 181 + local settle=$2 181 182 182 - shift 1 183 + shift 2 183 184 184 185 if [ ! -c /dev/ublk-control ]; then 185 186 return ${UBLK_SKIP_CODE} ··· 195 194 echo "fail to add ublk dev $*" 196 195 return 255 197 196 fi 198 - udevadm settle 197 + 198 + if [ "$settle" = "yes" ]; then 199 + udevadm settle 200 + fi 199 201 200 202 if [[ "$dev_id" =~ ^[0-9]+$ ]]; then 201 203 echo "${dev_id}" ··· 208 204 } 209 205 210 206 _add_ublk_dev() { 211 - _create_ublk_dev "add" "$@" 207 + _create_ublk_dev "add" "yes" "$@" 208 + } 209 + 210 + _add_ublk_dev_no_settle() { 211 + _create_ublk_dev "add" "no" "$@" 212 212 } 213 213 214 214 _recover_ublk_dev() { 215 215 local dev_id 216 216 local state 217 217 218 - dev_id=$(_create_ublk_dev "recover" "$@") 218 + dev_id=$(_create_ublk_dev "recover" "yes" "$@") 219 219 for ((j=0;j<20;j++)); do 220 220 state=$(_get_ublk_dev_state "${dev_id}") 221 221 [ "$state" == "LIVE" ] && break
+68
tools/testing/selftests/ublk/test_generic_15.sh
··· 1 + #!/bin/bash 2 + # SPDX-License-Identifier: GPL-2.0 3 + 4 + . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh 5 + 6 + TID="generic_15" 7 + ERR_CODE=0 8 + 9 + _test_partition_scan_no_hang() 10 + { 11 + local recovery_flag=$1 12 + local expected_state=$2 13 + local dev_id 14 + local state 15 + local daemon_pid 16 + local start_time 17 + local elapsed 18 + 19 + # Create ublk device with fault_inject target and very large delay 20 + # to simulate hang during partition table read 21 + # --delay_us 60000000 = 60 seconds delay 22 + # Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting 23 + # for partition scan events to complete 24 + if [ "$recovery_flag" = "yes" ]; then 25 + echo "Testing partition scan with recovery support..." 26 + dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1) 27 + else 28 + echo "Testing partition scan without recovery..." 29 + dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000) 30 + fi 31 + 32 + _check_add_dev "$TID" $? 33 + 34 + # The add command should return quickly because partition scan is async. 35 + # Now sleep briefly to let the async partition scan work start and hit 36 + # the delay in the fault_inject handler. 37 + sleep 1 38 + 39 + # Kill the ublk daemon while partition scan is potentially blocked 40 + # And check state transitions properly 41 + start_time=${SECONDS} 42 + daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") 43 + state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}") 44 + elapsed=$((SECONDS - start_time)) 45 + 46 + # Verify the device transitioned to expected state 47 + if [ "$state" != "${expected_state}" ]; then 48 + echo "FAIL: Device state is $state, expected ${expected_state}" 49 + ERR_CODE=255 50 + ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 51 + return 52 + fi 53 + echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" 54 + 55 + # Clean up the device 56 + ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 57 + } 58 + 59 + _prep_test "partition_scan" "verify async partition scan prevents IO hang" 60 + 61 + # Test 1: Without recovery support - should transition to DEAD 62 + _test_partition_scan_no_hang "no" "DEAD" 63 + 64 + # Test 2: With recovery support - should transition to QUIESCED 65 + _test_partition_scan_no_hang "yes" "QUIESCED" 66 + 67 + _cleanup_test "partition_scan" 68 + _show_result $TID $ERR_CODE