Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

fs: report filesystem and file I/O errors to fsnotify

Create some wrapper code around struct super_block so that filesystems
have a standard way to queue filesystem metadata and file I/O error
reports to have them sent to fsnotify.

If a filesystem wants to provide an error number, it must supply only
negative error numbers. These are stored internally as negative
numbers, but they are converted to positive error numbers before being
passed to fanotify, per the fanotify(7) manpage. Implementations of
super_operations::report_error are passed the raw internal event data.

Note that we have to play some shenanigans with mempools and queue_work
so that error handling always happens in process context, and so that
the event handler functions (both ->report_error and fsnotify) can
handle file I/O error messages without having to worry about whatever
locks might be held. This asynchronicity requires that unmount wait for
pending events to clear.

Add a new callback to the superblock operations structure so that
filesystem drivers can themselves respond to file I/O errors if they so
desire. This will be used for an upcoming self-healing patchset for
XFS.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Link: https://patch.msgid.link/176826402610.3490369.4378391061533403171.stgit@frogsfrogsfrogs
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>

authored by

Darrick J. Wong and committed by
Christian Brauner
21945e6c 60254477

+280 -1
+1 -1
fs/Makefile
··· 16 16 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ 17 17 fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ 18 18 kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ 19 - file_attr.o 19 + file_attr.o fserror.o 20 20 21 21 obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o 22 22 obj-$(CONFIG_PROC_FS) += proc_namespace.o
+194
fs/fserror.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2025 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include <linux/fs.h> 7 + #include <linux/fsnotify.h> 8 + #include <linux/mempool.h> 9 + #include <linux/fserror.h> 10 + 11 + #define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) 12 + 13 + static struct mempool fserror_events_pool; 14 + 15 + void fserror_mount(struct super_block *sb) 16 + { 17 + /* 18 + * The pending error counter is biased by 1 so that we don't wake_var 19 + * until we're actually trying to unmount. 20 + */ 21 + refcount_set(&sb->s_pending_errors, 1); 22 + } 23 + 24 + void fserror_unmount(struct super_block *sb) 25 + { 26 + /* 27 + * If we don't drop the pending error count to zero, then wait for it 28 + * to drop below 1, which means that the pending errors cleared and 29 + * hopefully we didn't saturate with 1 billion+ concurrent events. 30 + */ 31 + if (!refcount_dec_and_test(&sb->s_pending_errors)) 32 + wait_var_event(&sb->s_pending_errors, 33 + refcount_read(&sb->s_pending_errors) < 1); 34 + } 35 + 36 + static inline void fserror_pending_dec(struct super_block *sb) 37 + { 38 + if (refcount_dec_and_test(&sb->s_pending_errors)) 39 + wake_up_var(&sb->s_pending_errors); 40 + } 41 + 42 + static inline void fserror_free_event(struct fserror_event *event) 43 + { 44 + fserror_pending_dec(event->sb); 45 + mempool_free(event, &fserror_events_pool); 46 + } 47 + 48 + static void fserror_worker(struct work_struct *work) 49 + { 50 + struct fserror_event *event = 51 + container_of(work, struct fserror_event, work); 52 + struct super_block *sb = event->sb; 53 + 54 + if (sb->s_flags & SB_ACTIVE) { 55 + struct fs_error_report report = { 56 + /* send positive error number to userspace */ 57 + .error = -event->error, 58 + .inode = event->inode, 59 + .sb = event->sb, 60 + }; 61 + 62 + if (sb->s_op->report_error) 63 + sb->s_op->report_error(event); 64 + 65 + fsnotify(FS_ERROR, &report, 
FSNOTIFY_EVENT_ERROR, NULL, NULL, 66 + NULL, 0); 67 + } 68 + 69 + iput(event->inode); 70 + fserror_free_event(event); 71 + } 72 + 73 + static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, 74 + gfp_t gfp_flags) 75 + { 76 + struct fserror_event *event = NULL; 77 + 78 + /* 79 + * If pending_errors already reached zero or is no longer active, 80 + * the superblock is being deactivated so there's no point in 81 + * continuing. 82 + * 83 + * The order of the check of s_pending_errors and SB_ACTIVE are 84 + * mandated by order of accesses in generic_shutdown_super and 85 + * fserror_unmount. Barriers are implicitly provided by the refcount 86 + * manipulations in this function and fserror_unmount. 87 + */ 88 + if (!refcount_inc_not_zero(&sb->s_pending_errors)) 89 + return NULL; 90 + if (!(sb->s_flags & SB_ACTIVE)) 91 + goto out_pending; 92 + 93 + event = mempool_alloc(&fserror_events_pool, gfp_flags); 94 + if (!event) 95 + goto out_pending; 96 + 97 + /* mempool_alloc doesn't support GFP_ZERO */ 98 + memset(event, 0, sizeof(*event)); 99 + event->sb = sb; 100 + INIT_WORK(&event->work, fserror_worker); 101 + 102 + return event; 103 + 104 + out_pending: 105 + fserror_pending_dec(sb); 106 + return NULL; 107 + } 108 + 109 + /** 110 + * fserror_report - report a filesystem error of some kind 111 + * 112 + * @sb: superblock of the filesystem 113 + * @inode: inode within that filesystem, if applicable 114 + * @type: type of error encountered 115 + * @pos: start of inode range affected, if applicable 116 + * @len: length of inode range affected, if applicable 117 + * @error: error number encountered, must be negative 118 + * @gfp: memory allocation flags for conveying the event to a worker, 119 + * since this function can be called from atomic contexts 120 + * 121 + * Report details of a filesystem error to the super_operations::report_error 122 + * callback if present; and to fsnotify for distribution to userspace. 
@sb, 123 + * @gfp, @type, and @error must all be specified. For file I/O errors, the 124 + * @inode, @pos, and @len fields must also be specified. For file metadata 125 + * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb 126 + * must point to @sb. 127 + * 128 + * Reporting work is deferred to a workqueue to ensure that ->report_error is 129 + * called from process context without any locks held. An active reference to 130 + * the inode is maintained until event handling is complete, and unmount will 131 + * wait for queued events to drain. 132 + */ 133 + void fserror_report(struct super_block *sb, struct inode *inode, 134 + enum fserror_type type, loff_t pos, u64 len, int error, 135 + gfp_t gfp) 136 + { 137 + struct fserror_event *event; 138 + 139 + /* sb and inode must be from the same filesystem */ 140 + WARN_ON_ONCE(inode && inode->i_sb != sb); 141 + 142 + /* error number must be negative */ 143 + WARN_ON_ONCE(error >= 0); 144 + 145 + event = fserror_alloc_event(sb, gfp); 146 + if (!event) 147 + goto lost; 148 + 149 + event->type = type; 150 + event->pos = pos; 151 + event->len = len; 152 + event->error = error; 153 + 154 + /* 155 + * Can't iput from non-sleeping context, so grabbing another reference 156 + * to the inode must be the last thing before submitting the event. 157 + */ 158 + if (inode) { 159 + event->inode = igrab(inode); 160 + if (!event->inode) 161 + goto lost_event; 162 + } 163 + 164 + /* 165 + * Use schedule_work here even if we're already in process context so 166 + * that fsnotify and super_operations::report_error implementations are 167 + * guaranteed to run in process context without any locks held. Since 168 + * errors are supposed to be rare, the overhead shouldn't kill us any 169 + * more than the failing device will. 
170 + */ 171 + schedule_work(&event->work); 172 + return; 173 + 174 + lost_event: 175 + fserror_free_event(event); 176 + lost: 177 + if (inode) 178 + pr_err_ratelimited( 179 + "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", 180 + sb->s_id, inode->i_ino, type, pos, len, error); 181 + else 182 + pr_err_ratelimited( 183 + "%s: lost filesystem error report for type %u error %d", 184 + sb->s_id, type, error); 185 + } 186 + EXPORT_SYMBOL_GPL(fserror_report); 187 + 188 + static int __init fserror_init(void) 189 + { 190 + return mempool_init_kmalloc_pool(&fserror_events_pool, 191 + FSERROR_DEFAULT_EVENT_POOL_SIZE, 192 + sizeof(struct fserror_event)); 193 + } 194 + fs_initcall(fserror_init);
+3
fs/super.c
··· 36 36 #include <linux/lockdep.h> 37 37 #include <linux/user_namespace.h> 38 38 #include <linux/fs_context.h> 39 + #include <linux/fserror.h> 39 40 #include <uapi/linux/mount.h> 40 41 #include "internal.h" 41 42 ··· 364 363 spin_lock_init(&s->s_inode_list_lock); 365 364 INIT_LIST_HEAD(&s->s_inodes_wb); 366 365 spin_lock_init(&s->s_inode_wblist_lock); 366 + fserror_mount(s); 367 367 368 368 s->s_count = 1; 369 369 atomic_set(&s->s_active, 1); ··· 624 622 sync_filesystem(sb); 625 623 sb->s_flags &= ~SB_ACTIVE; 626 624 625 + fserror_unmount(sb); 627 626 cgroup_writeback_umount(sb); 628 627 629 628 /* Evict all inodes with zero refcount. */
+7
include/linux/fs/super_types.h
··· 35 35 struct workqueue_struct; 36 36 struct writeback_control; 37 37 struct xattr_handler; 38 + struct fserror_event; 38 39 39 40 extern struct super_block *blockdev_superblock; 40 41 ··· 125 124 */ 126 125 int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); 127 126 void (*shutdown)(struct super_block *sb); 127 + 128 + /* Report a filesystem error */ 129 + void (*report_error)(const struct fserror_event *event); 128 130 }; 129 131 130 132 struct super_block { ··· 272 268 spinlock_t s_inode_wblist_lock; 273 269 struct list_head s_inodes_wb; /* writeback inodes */ 274 270 long s_min_writeback_pages; 271 + 272 + /* number of fserrors that are being sent to fsnotify/filesystems */ 273 + refcount_t s_pending_errors; 275 274 } __randomize_layout; 276 275 277 276 /*
+75
include/linux/fserror.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (c) 2025 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef _LINUX_FSERROR_H__ 7 + #define _LINUX_FSERROR_H__ 8 + 9 + void fserror_mount(struct super_block *sb); 10 + void fserror_unmount(struct super_block *sb); 11 + 12 + enum fserror_type { 13 + /* pagecache I/O failed */ 14 + FSERR_BUFFERED_READ, 15 + FSERR_BUFFERED_WRITE, 16 + 17 + /* direct I/O failed */ 18 + FSERR_DIRECTIO_READ, 19 + FSERR_DIRECTIO_WRITE, 20 + 21 + /* out of band media error reported */ 22 + FSERR_DATA_LOST, 23 + 24 + /* filesystem metadata */ 25 + FSERR_METADATA, 26 + }; 27 + 28 + struct fserror_event { 29 + struct work_struct work; 30 + struct super_block *sb; 31 + struct inode *inode; 32 + loff_t pos; 33 + u64 len; 34 + enum fserror_type type; 35 + 36 + /* negative error number */ 37 + int error; 38 + }; 39 + 40 + void fserror_report(struct super_block *sb, struct inode *inode, 41 + enum fserror_type type, loff_t pos, u64 len, int error, 42 + gfp_t gfp); 43 + 44 + static inline void fserror_report_io(struct inode *inode, 45 + enum fserror_type type, loff_t pos, 46 + u64 len, int error, gfp_t gfp) 47 + { 48 + fserror_report(inode->i_sb, inode, type, pos, len, error, gfp); 49 + } 50 + 51 + static inline void fserror_report_data_lost(struct inode *inode, loff_t pos, 52 + u64 len, gfp_t gfp) 53 + { 54 + fserror_report(inode->i_sb, inode, FSERR_DATA_LOST, pos, len, -EIO, 55 + gfp); 56 + } 57 + 58 + static inline void fserror_report_file_metadata(struct inode *inode, int error, 59 + gfp_t gfp) 60 + { 61 + fserror_report(inode->i_sb, inode, FSERR_METADATA, 0, 0, error, gfp); 62 + } 63 + 64 + static inline void fserror_report_metadata(struct super_block *sb, int error, 65 + gfp_t gfp) 66 + { 67 + fserror_report(sb, NULL, FSERR_METADATA, 0, 0, error, gfp); 68 + } 69 + 70 + static inline void fserror_report_shutdown(struct super_block *sb, gfp_t gfp) 71 + { 72 + 
fserror_report(sb, NULL, FSERR_METADATA, 0, 0, -ESHUTDOWN, gfp); 73 + } 74 + 75 + #endif /* _LINUX_FSERROR_H__ */