Merge tag 'health-monitoring-7.0_2026-01-20' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-7.0-merge

+2

arch/alpha/include/uapi/asm/errno.h

··· 55 55 #define ENOSR 82 /* Out of streams resources */ 56 56 #define ETIME 83 /* Timer expired */ 57 57 #define EBADMSG 84 /* Not a data message */ 58 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 58 59 #define EPROTO 85 /* Protocol error */ 59 60 #define ENODATA 86 /* No data available */ 60 61 #define ENOSTR 87 /* Device not a stream */ ··· 97 96 #define EREMCHG 115 /* Remote address changed */ 98 97 99 98 #define EUCLEAN 117 /* Structure needs cleaning */ 99 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 100 100 #define ENOTNAM 118 /* Not a XENIX named type file */ 101 101 #define ENAVAIL 119 /* No XENIX semaphores available */ 102 102 #define EISNAM 120 /* Is a named type file */

+2

arch/mips/include/uapi/asm/errno.h

··· 50 50 #define EDOTDOT 73 /* RFS specific error */ 51 51 #define EMULTIHOP 74 /* Multihop attempted */ 52 52 #define EBADMSG 77 /* Not a data message */ 53 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 53 54 #define ENAMETOOLONG 78 /* File name too long */ 54 55 #define EOVERFLOW 79 /* Value too large for defined data type */ 55 56 #define ENOTUNIQ 80 /* Name not unique on network */ ··· 89 88 #define EISCONN 133 /* Transport endpoint is already connected */ 90 89 #define ENOTCONN 134 /* Transport endpoint is not connected */ 91 90 #define EUCLEAN 135 /* Structure needs cleaning */ 91 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 92 92 #define ENOTNAM 137 /* Not a XENIX named type file */ 93 93 #define ENAVAIL 138 /* No XENIX semaphores available */ 94 94 #define EISNAM 139 /* Is a named type file */

+2

arch/parisc/include/uapi/asm/errno.h

··· 36 36 37 37 #define EDOTDOT 66 /* RFS specific error */ 38 38 #define EBADMSG 67 /* Not a data message */ 39 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 39 40 #define EUSERS 68 /* Too many users */ 40 41 #define EDQUOT 69 /* Quota exceeded */ 41 42 #define ESTALE 70 /* Stale file handle */ ··· 63 62 #define ERESTART 175 /* Interrupted system call should be restarted */ 64 63 #define ESTRPIPE 176 /* Streams pipe error */ 65 64 #define EUCLEAN 177 /* Structure needs cleaning */ 65 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 66 66 #define ENOTNAM 178 /* Not a XENIX named type file */ 67 67 #define ENAVAIL 179 /* No XENIX semaphores available */ 68 68 #define EISNAM 180 /* Is a named type file */

+2

arch/sparc/include/uapi/asm/errno.h

··· 48 48 #define ENOSR 74 /* Out of streams resources */ 49 49 #define ENOMSG 75 /* No message of desired type */ 50 50 #define EBADMSG 76 /* Not a data message */ 51 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 51 52 #define EIDRM 77 /* Identifier removed */ 52 53 #define EDEADLK 78 /* Resource deadlock would occur */ 53 54 #define ENOLCK 79 /* No record locks available */ ··· 92 91 #define ENOTUNIQ 115 /* Name not unique on network */ 93 92 #define ERESTART 116 /* Interrupted syscall should be restarted */ 94 93 #define EUCLEAN 117 /* Structure needs cleaning */ 94 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 95 95 #define ENOTNAM 118 /* Not a XENIX named type file */ 96 96 #define ENAVAIL 119 /* No XENIX semaphores available */ 97 97 #define EISNAM 120 /* Is a named type file */

+1 -1

fs/Makefile

··· 16 16 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ 17 17 fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ 18 18 kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ 19 - file_attr.o 19 + file_attr.o fserror.o 20 20 21 21 obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o 22 22 obj-$(CONFIG_PROC_FS) += proc_namespace.o

-2

fs/erofs/internal.h

··· 541 541 long erofs_compat_ioctl(struct file *filp, unsigned int cmd, 542 542 unsigned long arg); 543 543 544 - #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 545 - 546 544 #endif /* __EROFS_INTERNAL_H */

-1

fs/ext2/ext2.h

··· 357 357 */ 358 358 #define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ 359 359 #define EXT2_ERROR_FS 0x0002 /* Errors detected */ 360 - #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 361 360 362 361 /* 363 362 * Mount flags

-3

fs/ext4/ext4.h

··· 3938 3938 get_block_t *get_block); 3939 3939 #endif /* __KERNEL__ */ 3940 3940 3941 - #define EFSBADCRC EBADMSG /* Bad CRC detected */ 3942 - #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 3943 - 3944 3941 #endif /* _EXT4_H */

+2

fs/ext4/ioctl.c

··· 26 26 #include <linux/fsmap.h> 27 27 #include "fsmap.h" 28 28 #include <trace/events/ext4.h> 29 + #include <linux/fserror.h> 29 30 30 31 typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, 31 32 struct ext4_super_block *es, ··· 845 844 return -EINVAL; 846 845 } 847 846 clear_opt(sb, DISCARD); 847 + fserror_report_shutdown(sb, GFP_KERNEL); 848 848 return 0; 849 849 } 850 850

+9 -4

fs/ext4/super.c

··· 48 48 #include <linux/fsnotify.h> 49 49 #include <linux/fs_context.h> 50 50 #include <linux/fs_parser.h> 51 + #include <linux/fserror.h> 51 52 52 53 #include "ext4.h" 53 54 #include "ext4_extents.h" /* Needed for trace points definition */ ··· 825 824 sb->s_id, function, line, current->comm, &vaf); 826 825 va_end(args); 827 826 } 828 - fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); 827 + fserror_report_metadata(sb, error ? -abs(error) : -EFSCORRUPTED, 828 + GFP_ATOMIC); 829 829 830 830 ext4_handle_error(sb, force_ro, error, 0, block, function, line); 831 831 } ··· 858 856 current->comm, &vaf); 859 857 va_end(args); 860 858 } 861 - fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); 859 + fserror_report_file_metadata(inode, 860 + error ? -abs(error) : -EFSCORRUPTED, 861 + GFP_ATOMIC); 862 862 863 863 ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, 864 864 function, line); ··· 900 896 current->comm, path, &vaf); 901 897 va_end(args); 902 898 } 903 - fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); 899 + fserror_report_file_metadata(inode, -EFSCORRUPTED, GFP_ATOMIC); 904 900 905 901 ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, 906 902 function, line); ··· 969 965 printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", 970 966 sb->s_id, function, line, errstr); 971 967 } 972 - fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); 968 + fserror_report_metadata(sb, errno ? -abs(errno) : -EFSCORRUPTED, 969 + GFP_ATOMIC); 973 970 974 971 ext4_handle_error(sb, false, -errno, 0, 0, function, line); 975 972 }

-3

fs/f2fs/f2fs.h

··· 5004 5004 f2fs_invalidate_compress_pages_range(sbi, blkaddr, len); 5005 5005 } 5006 5006 5007 - #define EFSBADCRC EBADMSG /* Bad CRC detected */ 5008 - #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 5009 - 5010 5007 #endif /* _LINUX_F2FS_H */

+194

fs/fserror.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2025 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include <linux/fs.h> 7 + #include <linux/fsnotify.h> 8 + #include <linux/mempool.h> 9 + #include <linux/fserror.h> 10 + 11 + #define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) 12 + 13 + static struct mempool fserror_events_pool; 14 + 15 + void fserror_mount(struct super_block *sb) 16 + { 17 + /* 18 + * The pending error counter is biased by 1 so that we don't wake_var 19 + * until we're actually trying to unmount. 20 + */ 21 + refcount_set(&sb->s_pending_errors, 1); 22 + } 23 + 24 + void fserror_unmount(struct super_block *sb) 25 + { 26 + /* 27 + * If we don't drop the pending error count to zero, then wait for it 28 + * to drop below 1, which means that the pending errors cleared and 29 + * hopefully we didn't saturate with 1 billion+ concurrent events. 30 + */ 31 + if (!refcount_dec_and_test(&sb->s_pending_errors)) 32 + wait_var_event(&sb->s_pending_errors, 33 + refcount_read(&sb->s_pending_errors) < 1); 34 + } 35 + 36 + static inline void fserror_pending_dec(struct super_block *sb) 37 + { 38 + if (refcount_dec_and_test(&sb->s_pending_errors)) 39 + wake_up_var(&sb->s_pending_errors); 40 + } 41 + 42 + static inline void fserror_free_event(struct fserror_event *event) 43 + { 44 + fserror_pending_dec(event->sb); 45 + mempool_free(event, &fserror_events_pool); 46 + } 47 + 48 + static void fserror_worker(struct work_struct *work) 49 + { 50 + struct fserror_event *event = 51 + container_of(work, struct fserror_event, work); 52 + struct super_block *sb = event->sb; 53 + 54 + if (sb->s_flags & SB_ACTIVE) { 55 + struct fs_error_report report = { 56 + /* send positive error number to userspace */ 57 + .error = -event->error, 58 + .inode = event->inode, 59 + .sb = event->sb, 60 + }; 61 + 62 + if (sb->s_op->report_error) 63 + sb->s_op->report_error(event); 64 + 65 + fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, 66 + NULL, 0); 67 + } 68 + 69 + iput(event->inode); 70 + fserror_free_event(event); 71 + } 72 + 73 + static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, 74 + gfp_t gfp_flags) 75 + { 76 + struct fserror_event *event = NULL; 77 + 78 + /* 79 + * If pending_errors already reached zero or is no longer active, 80 + * the superblock is being deactivated so there's no point in 81 + * continuing. 82 + * 83 + * The order of the check of s_pending_errors and SB_ACTIVE are 84 + * mandated by order of accesses in generic_shutdown_super and 85 + * fserror_unmount. Barriers are implicitly provided by the refcount 86 + * manipulations in this function and fserror_unmount. 87 + */ 88 + if (!refcount_inc_not_zero(&sb->s_pending_errors)) 89 + return NULL; 90 + if (!(sb->s_flags & SB_ACTIVE)) 91 + goto out_pending; 92 + 93 + event = mempool_alloc(&fserror_events_pool, gfp_flags); 94 + if (!event) 95 + goto out_pending; 96 + 97 + /* mempool_alloc doesn't support GFP_ZERO */ 98 + memset(event, 0, sizeof(*event)); 99 + event->sb = sb; 100 + INIT_WORK(&event->work, fserror_worker); 101 + 102 + return event; 103 + 104 + out_pending: 105 + fserror_pending_dec(sb); 106 + return NULL; 107 + } 108 + 109 + /** 110 + * fserror_report - report a filesystem error of some kind 111 + * 112 + * @sb: superblock of the filesystem 113 + * @inode: inode within that filesystem, if applicable 114 + * @type: type of error encountered 115 + * @pos: start of inode range affected, if applicable 116 + * @len: length of inode range affected, if applicable 117 + * @error: error number encountered, must be negative 118 + * @gfp: memory allocation flags for conveying the event to a worker, 119 + * since this function can be called from atomic contexts 120 + * 121 + * Report details of a filesystem error to the super_operations::report_error 122 + * callback if present; and to fsnotify for distribution to userspace. @sb, 123 + * @gfp, @type, and @error must all be specified. For file I/O errors, the 124 + * @inode, @pos, and @len fields must also be specified. For file metadata 125 + * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb 126 + * must point to @sb. 127 + * 128 + * Reporting work is deferred to a workqueue to ensure that ->report_error is 129 + * called from process context without any locks held. An active reference to 130 + * the inode is maintained until event handling is complete, and unmount will 131 + * wait for queued events to drain. 132 + */ 133 + void fserror_report(struct super_block *sb, struct inode *inode, 134 + enum fserror_type type, loff_t pos, u64 len, int error, 135 + gfp_t gfp) 136 + { 137 + struct fserror_event *event; 138 + 139 + /* sb and inode must be from the same filesystem */ 140 + WARN_ON_ONCE(inode && inode->i_sb != sb); 141 + 142 + /* error number must be negative */ 143 + WARN_ON_ONCE(error >= 0); 144 + 145 + event = fserror_alloc_event(sb, gfp); 146 + if (!event) 147 + goto lost; 148 + 149 + event->type = type; 150 + event->pos = pos; 151 + event->len = len; 152 + event->error = error; 153 + 154 + /* 155 + * Can't iput from non-sleeping context, so grabbing another reference 156 + * to the inode must be the last thing before submitting the event. 157 + */ 158 + if (inode) { 159 + event->inode = igrab(inode); 160 + if (!event->inode) 161 + goto lost_event; 162 + } 163 + 164 + /* 165 + * Use schedule_work here even if we're already in process context so 166 + * that fsnotify and super_operations::report_error implementations are 167 + * guaranteed to run in process context without any locks held. Since 168 + * errors are supposed to be rare, the overhead shouldn't kill us any 169 + * more than the failing device will. 170 + */ 171 + schedule_work(&event->work); 172 + return; 173 + 174 + lost_event: 175 + fserror_free_event(event); 176 + lost: 177 + if (inode) 178 + pr_err_ratelimited( 179 + "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", 180 + sb->s_id, inode->i_ino, type, pos, len, error); 181 + else 182 + pr_err_ratelimited( 183 + "%s: lost filesystem error report for type %u error %d", 184 + sb->s_id, type, error); 185 + } 186 + EXPORT_SYMBOL_GPL(fserror_report); 187 + 188 + static int __init fserror_init(void) 189 + { 190 + return mempool_init_kmalloc_pool(&fserror_events_pool, 191 + FSERROR_DEFAULT_EVENT_POOL_SIZE, 192 + sizeof(struct fserror_event)); 193 + } 194 + fs_initcall(fserror_init);

+22 -1

fs/iomap/buffered-io.c

··· 8 8 #include <linux/writeback.h> 9 9 #include <linux/swap.h> 10 10 #include <linux/migrate.h> 11 + #include <linux/fserror.h> 11 12 #include "internal.h" 12 13 #include "trace.h" 13 14 ··· 372 371 if (folio_test_uptodate(folio)) 373 372 return 0; 374 373 375 - if (WARN_ON_ONCE(size > iomap->length)) 374 + if (WARN_ON_ONCE(size > iomap->length)) { 375 + fserror_report_io(iter->inode, FSERR_BUFFERED_READ, 376 + iomap->offset, size, -EIO, GFP_NOFS); 376 377 return -EIO; 378 + } 377 379 if (offset > 0) 378 380 ifs_alloc(iter->inode, folio, iter->flags); 379 381 ··· 402 398 finished = !ifs->read_bytes_pending; 403 399 spin_unlock_irqrestore(&ifs->state_lock, flags); 404 400 } 401 + 402 + if (error) 403 + fserror_report_io(folio->mapping->host, FSERR_BUFFERED_READ, 404 + folio_pos(folio) + off, len, error, 405 + GFP_ATOMIC); 405 406 406 407 if (finished) 407 408 folio_end_read(folio, uptodate); ··· 549 540 if (!*bytes_submitted) 550 541 iomap_read_init(folio); 551 542 ret = ctx->ops->read_folio_range(iter, ctx, plen); 543 + if (ret < 0) 544 + fserror_report_io(iter->inode, 545 + FSERR_BUFFERED_READ, pos, 546 + plen, ret, GFP_NOFS); 552 547 if (ret) 553 548 return ret; 554 549 *bytes_submitted += plen; ··· 828 815 else 829 816 status = iomap_bio_read_folio_range_sync(iter, 830 817 folio, block_start, plen); 818 + if (status < 0) 819 + fserror_report_io(iter->inode, 820 + FSERR_BUFFERED_READ, pos, 821 + len, status, GFP_NOFS); 831 822 if (status) 832 823 return status; 833 824 } ··· 1842 1825 u64 pos = folio_pos(folio); 1843 1826 u64 end_pos = pos + folio_size(folio); 1844 1827 u64 end_aligned = 0; 1828 + loff_t orig_pos = pos; 1845 1829 size_t bytes_submitted = 0; 1846 1830 int error = 0; 1847 1831 u32 rlen; ··· 1886 1868 1887 1869 if (bytes_submitted) 1888 1870 wpc->nr_folios++; 1871 + if (error && pos > orig_pos) 1872 + fserror_report_io(inode, FSERR_BUFFERED_WRITE, orig_pos, 0, 1873 + error, GFP_NOFS); 1889 1874 1890 1875 /* 1891 1876 * We can have dirty bits set past end of file in page_mkwrite path

+12

fs/iomap/direct-io.c

··· 7 7 #include <linux/pagemap.h> 8 8 #include <linux/iomap.h> 9 9 #include <linux/task_io_accounting_ops.h> 10 + #include <linux/fserror.h> 10 11 #include "internal.h" 11 12 #include "trace.h" 12 13 ··· 79 78 } 80 79 } 81 80 81 + static inline enum fserror_type iomap_dio_err_type(const struct iomap_dio *dio) 82 + { 83 + if (dio->flags & IOMAP_DIO_WRITE) 84 + return FSERR_DIRECTIO_WRITE; 85 + return FSERR_DIRECTIO_READ; 86 + } 87 + 82 88 ssize_t iomap_dio_complete(struct iomap_dio *dio) 83 89 { 84 90 const struct iomap_dio_ops *dops = dio->dops; ··· 95 87 96 88 if (dops && dops->end_io) 97 89 ret = dops->end_io(iocb, dio->size, ret, dio->flags); 90 + if (dio->error) 91 + fserror_report_io(file_inode(iocb->ki_filp), 92 + iomap_dio_err_type(dio), offset, dio->size, 93 + dio->error, GFP_NOFS); 98 94 99 95 if (likely(!ret)) { 100 96 ret = dio->size;

+6

fs/iomap/ioend.c

··· 6 6 #include <linux/list_sort.h> 7 7 #include <linux/pagemap.h> 8 8 #include <linux/writeback.h> 9 + #include <linux/fserror.h> 9 10 #include "internal.h" 10 11 #include "trace.h" 11 12 ··· 56 55 57 56 /* walk all folios in bio, ending page IO on them */ 58 57 bio_for_each_folio_all(fi, bio) { 58 + if (ioend->io_error) 59 + fserror_report_io(inode, FSERR_BUFFERED_WRITE, 60 + folio_pos(fi.folio) + fi.offset, 61 + fi.length, ioend->io_error, 62 + GFP_ATOMIC); 59 63 iomap_finish_folio_write(inode, fi.folio, fi.length); 60 64 folio_count++; 61 65 }

-2

fs/minix/minix.h

··· 175 175 __minix_error_inode((inode), __func__, __LINE__, \ 176 176 (fmt), ##__VA_ARGS__) 177 177 178 - #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 179 - 180 178 #endif /* FS_MINIX_H */

+3

fs/super.c

··· 36 36 #include <linux/lockdep.h> 37 37 #include <linux/user_namespace.h> 38 38 #include <linux/fs_context.h> 39 + #include <linux/fserror.h> 39 40 #include <uapi/linux/mount.h> 40 41 #include "internal.h" 41 42 ··· 364 363 spin_lock_init(&s->s_inode_list_lock); 365 364 INIT_LIST_HEAD(&s->s_inodes_wb); 366 365 spin_lock_init(&s->s_inode_wblist_lock); 366 + fserror_mount(s); 367 367 368 368 s->s_count = 1; 369 369 atomic_set(&s->s_active, 1); ··· 624 622 sync_filesystem(sb); 625 623 sb->s_flags &= ~SB_ACTIVE; 626 624 625 + fserror_unmount(sb); 627 626 cgroup_writeback_umount(sb); 628 627 629 628 /* Evict all inodes with zero refcount. */

-2

fs/udf/udf_sb.h

··· 55 55 #define MF_DUPLICATE_MD 0x01 56 56 #define MF_MIRROR_FE_LOADED 0x02 57 57 58 - #define EFSCORRUPTED EUCLEAN 59 - 60 58 struct udf_meta_data { 61 59 __u32 s_meta_file_loc; 62 60 __u32 s_mirror_file_loc;

+2

fs/xfs/Makefile

··· 88 88 xfs_globals.o \ 89 89 xfs_handle.o \ 90 90 xfs_health.o \ 91 + xfs_healthmon.o \ 91 92 xfs_icache.o \ 92 93 xfs_ioctl.o \ 93 94 xfs_iomap.o \ ··· 106 105 xfs_symlink.o \ 107 106 xfs_sysfs.o \ 108 107 xfs_trans.o \ 108 + xfs_verify_media.o \ 109 109 xfs_xattr.o 110 110 111 111 # low-level transaction/log code

+189

fs/xfs/libxfs/xfs_fs.h

··· 1003 1003 #define XFS_RTGROUP_GEOM_SICK_RMAPBT (1U << 3) /* reverse mappings */ 1004 1004 #define XFS_RTGROUP_GEOM_SICK_REFCNTBT (1U << 4) /* reference counts */ 1005 1005 1006 + /* Health monitor event domains */ 1007 + 1008 + /* affects the whole fs */ 1009 + #define XFS_HEALTH_MONITOR_DOMAIN_MOUNT (0) 1010 + 1011 + /* metadata health events */ 1012 + #define XFS_HEALTH_MONITOR_DOMAIN_FS (1) 1013 + #define XFS_HEALTH_MONITOR_DOMAIN_AG (2) 1014 + #define XFS_HEALTH_MONITOR_DOMAIN_INODE (3) 1015 + #define XFS_HEALTH_MONITOR_DOMAIN_RTGROUP (4) 1016 + 1017 + /* disk events */ 1018 + #define XFS_HEALTH_MONITOR_DOMAIN_DATADEV (5) 1019 + #define XFS_HEALTH_MONITOR_DOMAIN_RTDEV (6) 1020 + #define XFS_HEALTH_MONITOR_DOMAIN_LOGDEV (7) 1021 + 1022 + /* file range events */ 1023 + #define XFS_HEALTH_MONITOR_DOMAIN_FILERANGE (8) 1024 + 1025 + /* Health monitor event types */ 1026 + 1027 + /* status of the monitor itself */ 1028 + #define XFS_HEALTH_MONITOR_TYPE_RUNNING (0) 1029 + #define XFS_HEALTH_MONITOR_TYPE_LOST (1) 1030 + 1031 + /* filesystem was unmounted */ 1032 + #define XFS_HEALTH_MONITOR_TYPE_UNMOUNT (2) 1033 + 1034 + /* metadata health events */ 1035 + #define XFS_HEALTH_MONITOR_TYPE_SICK (3) 1036 + #define XFS_HEALTH_MONITOR_TYPE_CORRUPT (4) 1037 + #define XFS_HEALTH_MONITOR_TYPE_HEALTHY (5) 1038 + 1039 + /* filesystem shutdown */ 1040 + #define XFS_HEALTH_MONITOR_TYPE_SHUTDOWN (6) 1041 + 1042 + /* media errors */ 1043 + #define XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR (7) 1044 + 1045 + /* pagecache I/O to a file range failed */ 1046 + #define XFS_HEALTH_MONITOR_TYPE_BUFREAD (8) 1047 + #define XFS_HEALTH_MONITOR_TYPE_BUFWRITE (9) 1048 + 1049 + /* direct I/O to a file range failed */ 1050 + #define XFS_HEALTH_MONITOR_TYPE_DIOREAD (10) 1051 + #define XFS_HEALTH_MONITOR_TYPE_DIOWRITE (11) 1052 + 1053 + /* out of band media error reported for a file range */ 1054 + #define XFS_HEALTH_MONITOR_TYPE_DATALOST (12) 1055 + 1056 + /* lost events */ 1057 + struct xfs_health_monitor_lost { 1058 + __u64 count; 1059 + }; 1060 + 1061 + /* fs/rt metadata */ 1062 + struct xfs_health_monitor_fs { 1063 + /* XFS_FSOP_GEOM_SICK_* flags */ 1064 + __u32 mask; 1065 + }; 1066 + 1067 + /* ag/rtgroup metadata */ 1068 + struct xfs_health_monitor_group { 1069 + /* XFS_{AG,RTGROUP}_SICK_* flags */ 1070 + __u32 mask; 1071 + __u32 gno; 1072 + }; 1073 + 1074 + /* inode metadata */ 1075 + struct xfs_health_monitor_inode { 1076 + /* XFS_BS_SICK_* flags */ 1077 + __u32 mask; 1078 + __u32 gen; 1079 + __u64 ino; 1080 + }; 1081 + 1082 + /* shutdown reasons */ 1083 + #define XFS_HEALTH_SHUTDOWN_META_IO_ERROR (1u << 0) 1084 + #define XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR (1u << 1) 1085 + #define XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT (1u << 2) 1086 + #define XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE (1u << 3) 1087 + #define XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK (1u << 4) 1088 + #define XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED (1u << 5) 1089 + 1090 + /* shutdown */ 1091 + struct xfs_health_monitor_shutdown { 1092 + /* XFS_HEALTH_SHUTDOWN_* flags */ 1093 + __u32 reasons; 1094 + }; 1095 + 1096 + /* file range events */ 1097 + struct xfs_health_monitor_filerange { 1098 + __u64 pos; 1099 + __u64 len; 1100 + __u64 ino; 1101 + __u32 gen; 1102 + __u32 error; 1103 + }; 1104 + 1105 + /* disk media errors */ 1106 + struct xfs_health_monitor_media { 1107 + __u64 daddr; 1108 + __u64 bbcount; 1109 + }; 1110 + 1111 + struct xfs_health_monitor_event { 1112 + /* XFS_HEALTH_MONITOR_DOMAIN_* */ 1113 + __u32 domain; 1114 + 1115 + /* XFS_HEALTH_MONITOR_TYPE_* */ 1116 + __u32 type; 1117 + 1118 + /* Timestamp of the event, in nanoseconds since the Unix epoch */ 1119 + __u64 time_ns; 1120 + 1121 + /* 1122 + * Details of the event. The primary clients are written in python 1123 + * and rust, so break this up because bindgen hates anonymous structs 1124 + * and unions. 1125 + */ 1126 + union { 1127 + struct xfs_health_monitor_lost lost; 1128 + struct xfs_health_monitor_fs fs; 1129 + struct xfs_health_monitor_group group; 1130 + struct xfs_health_monitor_inode inode; 1131 + struct xfs_health_monitor_shutdown shutdown; 1132 + struct xfs_health_monitor_media media; 1133 + struct xfs_health_monitor_filerange filerange; 1134 + } e; 1135 + 1136 + /* zeroes */ 1137 + __u64 pad[2]; 1138 + }; 1139 + 1140 + struct xfs_health_monitor { 1141 + __u64 flags; /* flags */ 1142 + __u8 format; /* output format */ 1143 + __u8 pad[23]; /* zeroes */ 1144 + }; 1145 + 1146 + /* Return all health status events, not just deltas */ 1147 + #define XFS_HEALTH_MONITOR_VERBOSE (1ULL << 0) 1148 + 1149 + #define XFS_HEALTH_MONITOR_ALL (XFS_HEALTH_MONITOR_VERBOSE) 1150 + 1151 + /* Initial return format version */ 1152 + #define XFS_HEALTH_MONITOR_FMT_V0 (0) 1153 + 1154 + /* 1155 + * Check that a given fd points to the same filesystem that the health monitor 1156 + * is monitoring. 1157 + */ 1158 + struct xfs_health_file_on_monitored_fs { 1159 + __s32 fd; 1160 + __u32 flags; /* zero for now */ 1161 + }; 1162 + 1163 + /* Verify the media of the underlying devices */ 1164 + struct xfs_verify_media { 1165 + __u32 me_dev; /* I: XFS_DEV_{DATA,LOG,RT} */ 1166 + __u32 me_flags; /* I: XFS_VERIFY_MEDIA_* */ 1167 + 1168 + /* 1169 + * IO: inclusive start of disk range to verify, in 512b blocks. 1170 + * Will be adjusted upwards as media verification succeeds. 1171 + */ 1172 + __u64 me_start_daddr; 1173 + 1174 + /* 1175 + * IO: exclusive end of the disk range to verify, in 512b blocks. 1176 + * Can be adjusted downwards to match device size. 1177 + */ 1178 + __u64 me_end_daddr; 1179 + 1180 + __u32 me_ioerror; /* O: I/O error (positive) */ 1181 + __u32 me_max_io_size; /* I: maximum IO size in bytes */ 1182 + 1183 + __u32 me_rest_us; /* I: rest time between IOs, usecs */ 1184 + __u32 me_pad; /* zero */ 1185 + }; 1186 + 1187 + #define XFS_VERIFY_MEDIA_REPORT (1 << 0) /* report to fsnotify */ 1188 + 1189 + #define XFS_VERIFY_MEDIA_FLAGS (XFS_VERIFY_MEDIA_REPORT) 1190 + 1006 1191 /* 1007 1192 * ioctl commands that are used by Linux filesystems 1008 1193 */ ··· 1227 1042 #define XFS_IOC_GETPARENTS_BY_HANDLE _IOWR('X', 63, struct xfs_getparents_by_handle) 1228 1043 #define XFS_IOC_SCRUBV_METADATA _IOWR('X', 64, struct xfs_scrub_vec_head) 1229 1044 #define XFS_IOC_RTGROUP_GEOMETRY _IOWR('X', 65, struct xfs_rtgroup_geometry) 1045 + #define XFS_IOC_HEALTH_MONITOR _IOW ('X', 68, struct xfs_health_monitor) 1046 + #define XFS_IOC_HEALTH_FD_ON_MONITORED_FS \ 1047 + _IOW ('X', 69, struct xfs_health_file_on_monitored_fs) 1048 + #define XFS_IOC_VERIFY_MEDIA _IOWR('X', 70, struct xfs_verify_media) 1230 1049 1231 1050 /* 1232 1051 * ioctl commands that replace IRIX syssgi()'s

+5

fs/xfs/libxfs/xfs_health.h

··· 289 289 #define xfs_metadata_is_sick(error) \ 290 290 (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC)) 291 291 292 + unsigned int xfs_healthmon_inode_mask(unsigned int sick_mask); 293 + unsigned int xfs_healthmon_rtgroup_mask(unsigned int sick_mask); 294 + unsigned int xfs_healthmon_perag_mask(unsigned int sick_mask); 295 + unsigned int xfs_healthmon_fs_mask(unsigned int sick_mask); 296 + 292 297 #endif /* __XFS_HEALTH_H__ */

+6

fs/xfs/xfs_fsops.c

··· 25 25 #include "xfs_rtrmap_btree.h" 26 26 #include "xfs_rtrefcount_btree.h" 27 27 #include "xfs_metafile.h" 28 + #include "xfs_healthmon.h" 29 + 30 + #include <linux/fserror.h> 28 31 29 32 /* 30 33 * Write new AG headers to disk. Non-transactional, but need to be ··· 543 540 "Please unmount the filesystem and rectify the problem(s)"); 544 541 if (xfs_error_level >= XFS_ERRLEVEL_HIGH) 545 542 xfs_stack_trace(); 543 + 544 + fserror_report_shutdown(mp->m_super, GFP_KERNEL); 545 + xfs_healthmon_report_shutdown(mp, flags); 546 546 } 547 547 548 548 /*

+138

fs/xfs/xfs_health.c

··· 19 19 #include "xfs_da_btree.h" 20 20 #include "xfs_quota_defs.h" 21 21 #include "xfs_rtgroup.h" 22 + #include "xfs_healthmon.h" 23 + 24 + #include <linux/fserror.h> 22 25 23 26 static void 24 27 xfs_health_unmount_group( ··· 108 105 struct xfs_mount *mp, 109 106 unsigned int mask) 110 107 { 108 + unsigned int old_mask; 109 + 111 110 ASSERT(!(mask & ~XFS_SICK_FS_ALL)); 112 111 trace_xfs_fs_mark_sick(mp, mask); 113 112 114 113 spin_lock(&mp->m_sb_lock); 114 + old_mask = mp->m_fs_sick; 115 115 mp->m_fs_sick |= mask; 116 116 spin_unlock(&mp->m_sb_lock); 117 + 118 + fserror_report_metadata(mp->m_super, -EFSCORRUPTED, GFP_NOFS); 119 + if (mask) 120 + xfs_healthmon_report_fs(mp, XFS_HEALTHMON_SICK, old_mask, mask); 117 121 } 118 122 119 123 /* Mark per-fs metadata as having been checked and found unhealthy by fsck. */ ··· 129 119 struct xfs_mount *mp, 130 120 unsigned int mask) 131 121 { 122 + unsigned int old_mask; 123 + 132 124 ASSERT(!(mask & ~XFS_SICK_FS_ALL)); 133 125 trace_xfs_fs_mark_corrupt(mp, mask); 134 126 135 127 spin_lock(&mp->m_sb_lock); 128 + old_mask = mp->m_fs_sick; 136 129 mp->m_fs_sick |= mask; 137 130 mp->m_fs_checked |= mask; 138 131 spin_unlock(&mp->m_sb_lock); 132 + 133 + fserror_report_metadata(mp->m_super, -EFSCORRUPTED, GFP_NOFS); 134 + if (mask) 135 + xfs_healthmon_report_fs(mp, XFS_HEALTHMON_CORRUPT, old_mask, 136 + mask); 139 137 } 140 138 141 139 /* Mark a per-fs metadata healed. */ ··· 152 134 struct xfs_mount *mp, 153 135 unsigned int mask) 154 136 { 137 + unsigned int old_mask; 138 + 155 139 ASSERT(!(mask & ~XFS_SICK_FS_ALL)); 156 140 trace_xfs_fs_mark_healthy(mp, mask); 157 141 158 142 spin_lock(&mp->m_sb_lock); 143 + old_mask = mp->m_fs_sick; 159 144 mp->m_fs_sick &= ~mask; 160 145 if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY)) 161 146 mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY; 162 147 mp->m_fs_checked |= mask; 163 148 spin_unlock(&mp->m_sb_lock); 149 + 150 + if (mask) 151 + xfs_healthmon_report_fs(mp, XFS_HEALTHMON_HEALTHY, old_mask, 152 + mask); 164 153 } 165 154 166 155 /* Sample which per-fs metadata are unhealthy. */ ··· 217 192 struct xfs_group *xg, 218 193 unsigned int mask) 219 194 { 195 + unsigned int old_mask; 196 + 220 197 xfs_group_check_mask(xg, mask); 221 198 trace_xfs_group_mark_sick(xg, mask); 222 199 223 200 spin_lock(&xg->xg_state_lock); 201 + old_mask = xg->xg_sick; 224 202 xg->xg_sick |= mask; 225 203 spin_unlock(&xg->xg_state_lock); 204 + 205 + fserror_report_metadata(xg->xg_mount->m_super, -EFSCORRUPTED, GFP_NOFS); 206 + if (mask) 207 + xfs_healthmon_report_group(xg, XFS_HEALTHMON_SICK, old_mask, 208 + mask); 226 209 } 227 210 228 211 /* ··· 241 208 struct xfs_group *xg, 242 209 unsigned int mask) 243 210 { 211 + unsigned int old_mask; 212 + 244 213 xfs_group_check_mask(xg, mask); 245 214 trace_xfs_group_mark_corrupt(xg, mask); 246 215 247 216 spin_lock(&xg->xg_state_lock); 217 + old_mask = xg->xg_sick; 248 218 xg->xg_sick |= mask; 249 219 xg->xg_checked |= mask; 250 220 spin_unlock(&xg->xg_state_lock); 221 + 222 + fserror_report_metadata(xg->xg_mount->m_super, -EFSCORRUPTED, GFP_NOFS); 223 + if (mask) 224 + xfs_healthmon_report_group(xg, XFS_HEALTHMON_CORRUPT, old_mask, 225 + mask); 251 226 } 252 227 253 228 /* ··· 266 225 struct xfs_group *xg, 267 226 unsigned int mask) 268 227 { 228 + unsigned int old_mask; 229 + 269 230 xfs_group_check_mask(xg, mask); 270 231 trace_xfs_group_mark_healthy(xg, mask); 271 232 272 233 spin_lock(&xg->xg_state_lock); 234 + old_mask = xg->xg_sick; 273 235 xg->xg_sick &= ~mask; 274 236 if (!(xg->xg_sick & XFS_SICK_AG_PRIMARY)) 275 237 xg->xg_sick &= ~XFS_SICK_AG_SECONDARY; 276 238 xg->xg_checked |= mask; 277 239 spin_unlock(&xg->xg_state_lock); 240 + 241 + if (mask) 242 + xfs_healthmon_report_group(xg, XFS_HEALTHMON_HEALTHY, old_mask, 243 + mask); 278 244 } 279 245 280 246 /* Sample which per-ag metadata are unhealthy. */ ··· 320 272 struct xfs_inode *ip, 321 273 unsigned int mask) 322 274 { 275 + unsigned int old_mask; 276 + 323 277 ASSERT(!(mask & ~XFS_SICK_INO_ALL)); 324 278 trace_xfs_inode_mark_sick(ip, mask); 325 279 326 280 spin_lock(&ip->i_flags_lock); 281 + old_mask = ip->i_sick; 327 282 ip->i_sick |= mask; 328 283 spin_unlock(&ip->i_flags_lock); 329 284 ··· 338 287 spin_lock(&VFS_I(ip)->i_lock); 339 288 inode_state_clear(VFS_I(ip), I_DONTCACHE); 340 289 spin_unlock(&VFS_I(ip)->i_lock); 290 + 291 + fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS); 292 + if (mask) 293 + xfs_healthmon_report_inode(ip, XFS_HEALTHMON_SICK, old_mask, 294 + mask); 341 295 } 342 296 343 297 /* Mark inode metadata as having been checked and found unhealthy by fsck. */ ··· 351 295 struct xfs_inode *ip, 352 296 unsigned int mask) 353 297 { 298 + unsigned int old_mask; 299 + 354 300 ASSERT(!(mask & ~XFS_SICK_INO_ALL)); 355 301 trace_xfs_inode_mark_corrupt(ip, mask); 356 302 357 303 spin_lock(&ip->i_flags_lock); 304 + old_mask = ip->i_sick; 358 305 ip->i_sick |= mask; 359 306 ip->i_checked |= mask; 360 307 spin_unlock(&ip->i_flags_lock); ··· 370 311 spin_lock(&VFS_I(ip)->i_lock); 371 312 inode_state_clear(VFS_I(ip), I_DONTCACHE); 372 313 spin_unlock(&VFS_I(ip)->i_lock); 314 + 315 + fserror_report_file_metadata(VFS_I(ip), -EFSCORRUPTED, GFP_NOFS); 316 + if (mask) 317 + xfs_healthmon_report_inode(ip, XFS_HEALTHMON_CORRUPT, old_mask, 318 + mask); 373 319 } 374 320 375 321 /* Mark parts of an inode healed. */ ··· 383 319 struct xfs_inode *ip, 384 320 unsigned int mask) 385 321 { 322 + unsigned int old_mask; 323 + 386 324 ASSERT(!(mask & ~XFS_SICK_INO_ALL)); 387 325 trace_xfs_inode_mark_healthy(ip, mask); 388 326 389 327 spin_lock(&ip->i_flags_lock); 328 + old_mask = ip->i_sick; 390 329 ip->i_sick &= ~mask; 391 330 if (!(ip->i_sick & XFS_SICK_INO_PRIMARY)) 392 331 ip->i_sick &= ~XFS_SICK_INO_SECONDARY; 393 332 ip->i_checked |= mask; 394 333 spin_unlock(&ip->i_flags_lock); 334 + 335 + if (mask) 336 + xfs_healthmon_report_inode(ip, XFS_HEALTHMON_HEALTHY, old_mask, 337 + mask); 395 338 } 396 339 397 340 /* Sample which parts of an inode are unhealthy. */ ··· 478 407 } 479 408 } 480 409 410 + /* 411 + * Translate XFS_SICK_FS_* into XFS_FSOP_GEOM_SICK_* except for the rt free 412 + * space codes, which are sent via the rtgroup events. 413 + */ 414 + unsigned int 415 + xfs_healthmon_fs_mask( 416 + unsigned int sick_mask) 417 + { 418 + const struct ioctl_sick_map *m; 419 + unsigned int ioctl_mask = 0; 420 + 421 + for_each_sick_map(fs_map, m) { 422 + if (sick_mask & m->sick_mask) 423 + ioctl_mask |= m->ioctl_mask; 424 + } 425 + 426 + return ioctl_mask; 427 + } 428 + 481 429 static const struct ioctl_sick_map ag_map[] = { 482 430 { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB }, 483 431 { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF }, ··· 533 443 } 534 444 } 535 445 446 + /* Translate XFS_SICK_AG_* into XFS_AG_GEOM_SICK_*. */ 447 + unsigned int 448 + xfs_healthmon_perag_mask( 449 + unsigned int sick_mask) 450 + { 451 + const struct ioctl_sick_map *m; 452 + unsigned int ioctl_mask = 0; 453 + 454 + for_each_sick_map(ag_map, m) { 455 + if (sick_mask & m->sick_mask) 456 + ioctl_mask |= m->ioctl_mask; 457 + } 458 + 459 + return ioctl_mask; 460 + } 461 + 536 462 static const struct ioctl_sick_map rtgroup_map[] = { 537 463 { XFS_SICK_RG_SUPER, XFS_RTGROUP_GEOM_SICK_SUPER }, 538 464 { XFS_SICK_RG_BITMAP, XFS_RTGROUP_GEOM_SICK_BITMAP }, ··· 577 471 if (sick & m->sick_mask) 578 472 rgeo->rg_sick |= m->ioctl_mask; 579 473 } 474 + } 475 + 476 + /* Translate XFS_SICK_RG_* into XFS_RTGROUP_GEOM_SICK_*. */ 477 + unsigned int 478 + xfs_healthmon_rtgroup_mask( 479 + unsigned int sick_mask) 480 + { 481 + const struct ioctl_sick_map *m; 482 + unsigned int ioctl_mask = 0; 483 + 484 + for_each_sick_map(rtgroup_map, m) { 485 + if (sick_mask & m->sick_mask) 486 + ioctl_mask |= m->ioctl_mask; 487 + } 488 + 489 + return ioctl_mask; 580 490 } 581 491 582 492 static const struct ioctl_sick_map ino_map[] = { ··· 631 509 if (sick & m->sick_mask) 632 510 bs->bs_sick |= m->ioctl_mask; 633 511 } 512 + } 513 + 514 + /* Translate XFS_SICK_INO_* into XFS_BS_SICK_*. */ 515 + unsigned int 516 + xfs_healthmon_inode_mask( 517 + unsigned int sick_mask) 518 + { 519 + const struct ioctl_sick_map *m; 520 + unsigned int ioctl_mask = 0; 521 + 522 + for_each_sick_map(ino_map, m) { 523 + if (sick_mask & m->sick_mask) 524 + ioctl_mask |= m->ioctl_mask; 525 + } 526 + 527 + return ioctl_mask; 634 528 } 635 529 636 530 /* Mark a block mapping sick. */

+1255

fs/xfs/xfs_healthmon.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2024-2026 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs_platform.h" 7 + #include "xfs_fs.h" 8 + #include "xfs_shared.h" 9 + #include "xfs_format.h" 10 + #include "xfs_log_format.h" 11 + #include "xfs_trans_resv.h" 12 + #include "xfs_mount.h" 13 + #include "xfs_inode.h" 14 + #include "xfs_trace.h" 15 + #include "xfs_ag.h" 16 + #include "xfs_btree.h" 17 + #include "xfs_da_format.h" 18 + #include "xfs_da_btree.h" 19 + #include "xfs_quota_defs.h" 20 + #include "xfs_rtgroup.h" 21 + #include "xfs_health.h" 22 + #include "xfs_healthmon.h" 23 + #include "xfs_fsops.h" 24 + #include "xfs_notify_failure.h" 25 + #include "xfs_file.h" 26 + #include "xfs_ioctl.h" 27 + 28 + #include <linux/anon_inodes.h> 29 + #include <linux/eventpoll.h> 30 + #include <linux/poll.h> 31 + #include <linux/fserror.h> 32 + 33 + /* 34 + * Live Health Monitoring 35 + * ====================== 36 + * 37 + * Autonomous self-healing of XFS filesystems requires a means for the kernel 38 + * to send filesystem health events to a monitoring daemon in userspace. To 39 + * accomplish this, we establish a thread_with_file kthread object to handle 40 + * translating internal events about filesystem health into a format that can 41 + * be parsed easily by userspace. When those internal events occur, the core 42 + * filesystem code calls this health monitor to convey the events to userspace. 43 + * Userspace reads events from the file descriptor returned by the ioctl. 44 + * 45 + * The healthmon abstraction has a weak reference to the host filesystem mount 46 + * so that the queueing and processing of the events do not pin the mount and 47 + * cannot slow down the main filesystem. The healthmon object can exist past 48 + * the end of the filesystem mount. 49 + */ 50 + 51 + /* sign of a detached health monitor */ 52 + #define DETACHED_MOUNT_COOKIE ((uintptr_t)0) 53 + 54 + /* Constrain the number of event objects that can build up in memory. */ 55 + #define XFS_HEALTHMON_MAX_EVENTS (SZ_32K / \ 56 + sizeof(struct xfs_healthmon_event)) 57 + 58 + /* Constrain the size of the output buffer for read_iter. */ 59 + #define XFS_HEALTHMON_MAX_OUTBUF SZ_64K 60 + 61 + /* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */ 62 + static DEFINE_SPINLOCK(xfs_healthmon_lock); 63 + 64 + /* Grab a reference to the healthmon object for a given mount, if any. */ 65 + static struct xfs_healthmon * 66 + xfs_healthmon_get( 67 + struct xfs_mount *mp) 68 + { 69 + struct xfs_healthmon *hm; 70 + 71 + rcu_read_lock(); 72 + hm = mp->m_healthmon; 73 + if (hm && !refcount_inc_not_zero(&hm->ref)) 74 + hm = NULL; 75 + rcu_read_unlock(); 76 + 77 + return hm; 78 + } 79 + 80 + /* 81 + * Release the reference to a healthmon object. If there are no more holders, 82 + * free the health monitor after an RCU grace period to eliminate possibility 83 + * of races with xfs_healthmon_get. 84 + */ 85 + static void 86 + xfs_healthmon_put( 87 + struct xfs_healthmon *hm) 88 + { 89 + if (refcount_dec_and_test(&hm->ref)) { 90 + struct xfs_healthmon_event *event; 91 + struct xfs_healthmon_event *next = hm->first_event; 92 + 93 + while ((event = next) != NULL) { 94 + trace_xfs_healthmon_drop(hm, event); 95 + next = event->next; 96 + kfree(event); 97 + } 98 + 99 + kfree(hm->unmount_event); 100 + kfree(hm->buffer); 101 + mutex_destroy(&hm->lock); 102 + kfree_rcu_mightsleep(hm); 103 + } 104 + } 105 + 106 + /* Attach a health monitor to an xfs_mount. Only one allowed at a time. */ 107 + STATIC int 108 + xfs_healthmon_attach( 109 + struct xfs_mount *mp, 110 + struct xfs_healthmon *hm) 111 + { 112 + spin_lock(&xfs_healthmon_lock); 113 + if (mp->m_healthmon != NULL) { 114 + spin_unlock(&xfs_healthmon_lock); 115 + return -EEXIST; 116 + } 117 + 118 + refcount_inc(&hm->ref); 119 + mp->m_healthmon = hm; 120 + hm->mount_cookie = (uintptr_t)mp->m_super; 121 + spin_unlock(&xfs_healthmon_lock); 122 + 123 + return 0; 124 + } 125 + 126 + /* Detach a xfs mount from a specific healthmon instance. */ 127 + STATIC void 128 + xfs_healthmon_detach( 129 + struct xfs_healthmon *hm) 130 + { 131 + spin_lock(&xfs_healthmon_lock); 132 + if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) { 133 + spin_unlock(&xfs_healthmon_lock); 134 + return; 135 + } 136 + 137 + XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL; 138 + hm->mount_cookie = DETACHED_MOUNT_COOKIE; 139 + spin_unlock(&xfs_healthmon_lock); 140 + 141 + trace_xfs_healthmon_detach(hm); 142 + xfs_healthmon_put(hm); 143 + } 144 + 145 + static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm) 146 + { 147 + hm->events++; 148 + hm->total_events++; 149 + } 150 + 151 + static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm) 152 + { 153 + hm->lost_prev_event++; 154 + hm->total_lost++; 155 + } 156 + 157 + /* 158 + * If possible, merge a new event into an existing event. Returns whether or 159 + * not it merged anything. 160 + */ 161 + static bool 162 + xfs_healthmon_merge_events( 163 + struct xfs_healthmon_event *existing, 164 + const struct xfs_healthmon_event *new) 165 + { 166 + if (!existing) 167 + return false; 168 + 169 + /* type and domain must match to merge events */ 170 + if (existing->type != new->type || 171 + existing->domain != new->domain) 172 + return false; 173 + 174 + switch (existing->type) { 175 + case XFS_HEALTHMON_RUNNING: 176 + case XFS_HEALTHMON_UNMOUNT: 177 + /* should only ever be one of these events anyway */ 178 + return false; 179 + 180 + case XFS_HEALTHMON_LOST: 181 + existing->lostcount += new->lostcount; 182 + return true; 183 + 184 + case XFS_HEALTHMON_SICK: 185 + case XFS_HEALTHMON_CORRUPT: 186 + case XFS_HEALTHMON_HEALTHY: 187 + switch (existing->domain) { 188 + case XFS_HEALTHMON_FS: 189 + existing->fsmask |= new->fsmask; 190 + return true; 191 + case XFS_HEALTHMON_AG: 192 + case XFS_HEALTHMON_RTGROUP: 193 + if (existing->group == new->group){ 194 + existing->grpmask |= new->grpmask; 195 + return true; 196 + } 197 + return false; 198 + case XFS_HEALTHMON_INODE: 199 + if (existing->ino == new->ino && 200 + existing->gen == new->gen) { 201 + existing->imask |= new->imask; 202 + return true; 203 + } 204 + return false; 205 + default: 206 + ASSERT(0); 207 + return false; 208 + } 209 + return false; 210 + 211 + case XFS_HEALTHMON_SHUTDOWN: 212 + /* yes, we can race to shutdown */ 213 + existing->flags |= new->flags; 214 + return true; 215 + 216 + case XFS_HEALTHMON_MEDIA_ERROR: 217 + /* physically adjacent errors can merge */ 218 + if (existing->daddr + existing->bbcount == new->daddr) { 219 + existing->bbcount += new->bbcount; 220 + return true; 221 + } 222 + if (new->daddr + new->bbcount == existing->daddr) { 223 + existing->daddr = new->daddr; 224 + existing->bbcount += new->bbcount; 225 + return true; 226 + } 227 + return false; 228 + 229 + case XFS_HEALTHMON_BUFREAD: 230 + case XFS_HEALTHMON_BUFWRITE: 231 + case XFS_HEALTHMON_DIOREAD: 232 + case XFS_HEALTHMON_DIOWRITE: 233 + case XFS_HEALTHMON_DATALOST: 234 + /* logically adjacent file ranges can merge */ 235 + if (existing->fino != new->fino || existing->fgen != new->fgen) 236 + return false; 237 + 238 + if (existing->fpos + existing->flen == new->fpos) { 239 + existing->flen += new->flen; 240 + return true; 241 + } 242 + 243 + if (new->fpos + new->flen == existing->fpos) { 244 + existing->fpos = new->fpos; 245 + existing->flen += new->flen; 246 + return true; 247 + } 248 + return false; 249 + } 250 + 251 + return false; 252 + } 253 + 254 + /* Insert an event onto the start of the queue. */ 255 + static inline void 256 + __xfs_healthmon_insert( 257 + struct xfs_healthmon *hm, 258 + struct xfs_healthmon_event *event) 259 + { 260 + struct timespec64 now; 261 + 262 + ktime_get_coarse_real_ts64(&now); 263 + event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; 264 + 265 + event->next = hm->first_event; 266 + if (!hm->first_event) 267 + hm->first_event = event; 268 + if (!hm->last_event) 269 + hm->last_event = event; 270 + xfs_healthmon_bump_events(hm); 271 + wake_up(&hm->wait); 272 + 273 + trace_xfs_healthmon_insert(hm, event); 274 + } 275 + 276 + /* Push an event onto the end of the queue. */ 277 + static inline void 278 + __xfs_healthmon_push( 279 + struct xfs_healthmon *hm, 280 + struct xfs_healthmon_event *event) 281 + { 282 + struct timespec64 now; 283 + 284 + ktime_get_coarse_real_ts64(&now); 285 + event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; 286 + 287 + if (!hm->first_event) 288 + hm->first_event = event; 289 + if (hm->last_event) 290 + hm->last_event->next = event; 291 + hm->last_event = event; 292 + event->next = NULL; 293 + xfs_healthmon_bump_events(hm); 294 + wake_up(&hm->wait); 295 + 296 + trace_xfs_healthmon_push(hm, event); 297 + } 298 + 299 + /* Deal with any previously lost events */ 300 + static int 301 + xfs_healthmon_clear_lost_prev( 302 + struct xfs_healthmon *hm) 303 + { 304 + struct xfs_healthmon_event lost_event = { 305 + .type = XFS_HEALTHMON_LOST, 306 + .domain = XFS_HEALTHMON_MOUNT, 307 + .lostcount = hm->lost_prev_event, 308 + }; 309 + struct xfs_healthmon_event *event = NULL; 310 + 311 + if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) { 312 + trace_xfs_healthmon_merge(hm, hm->last_event); 313 + wake_up(&hm->wait); 314 + goto cleared; 315 + } 316 + 317 + if (hm->events < XFS_HEALTHMON_MAX_EVENTS) 318 + event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event), 319 + GFP_NOFS); 320 + if (!event) 321 + return -ENOMEM; 322 + 323 + __xfs_healthmon_push(hm, event); 324 + cleared: 325 + hm->lost_prev_event = 0; 326 + return 0; 327 + } 328 + 329 + /* 330 + * Push an event onto the end of the list after dealing with lost events and 331 + * possibly full queues. 332 + */ 333 + STATIC int 334 + xfs_healthmon_push( 335 + struct xfs_healthmon *hm, 336 + const struct xfs_healthmon_event *template) 337 + { 338 + struct xfs_healthmon_event *event = NULL; 339 + int error = 0; 340 + 341 + /* 342 + * Locklessly check if the health monitor has already detached from the 343 + * mount. If so, ignore the event. If we race with deactivation, 344 + * we'll queue the event but never send it. 345 + */ 346 + if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) 347 + return -ESHUTDOWN; 348 + 349 + mutex_lock(&hm->lock); 350 + 351 + /* Report previously lost events before we do anything else */ 352 + if (hm->lost_prev_event) { 353 + error = xfs_healthmon_clear_lost_prev(hm); 354 + if (error) 355 + goto out_unlock; 356 + } 357 + 358 + /* Try to merge with the newest event */ 359 + if (xfs_healthmon_merge_events(hm->last_event, template)) { 360 + trace_xfs_healthmon_merge(hm, hm->last_event); 361 + wake_up(&hm->wait); 362 + goto out_unlock; 363 + } 364 + 365 + /* Only create a heap event object if we're not already at capacity. */ 366 + if (hm->events < XFS_HEALTHMON_MAX_EVENTS) 367 + event = kmemdup(template, sizeof(struct xfs_healthmon_event), 368 + GFP_NOFS); 369 + if (!event) { 370 + /* No memory means we lose the event */ 371 + trace_xfs_healthmon_lost_event(hm); 372 + xfs_healthmon_bump_lost(hm); 373 + error = -ENOMEM; 374 + goto out_unlock; 375 + } 376 + 377 + __xfs_healthmon_push(hm, event); 378 + 379 + out_unlock: 380 + mutex_unlock(&hm->lock); 381 + return error; 382 + } 383 + 384 + /* 385 + * Report that the filesystem is being unmounted, then detach the xfs mount 386 + * from this healthmon instance. 387 + */ 388 + void 389 + xfs_healthmon_unmount( 390 + struct xfs_mount *mp) 391 + { 392 + struct xfs_healthmon *hm = xfs_healthmon_get(mp); 393 + 394 + if (!hm) 395 + return; 396 + 397 + trace_xfs_healthmon_report_unmount(hm); 398 + 399 + /* 400 + * Insert the unmount notification at the start of the event queue so 401 + * that userspace knows the filesystem went away as soon as possible. 402 + * There's nothing actionable for userspace after an unmount. Once 403 + * we've inserted the unmount event, hm no longer owns that event. 404 + */ 405 + __xfs_healthmon_insert(hm, hm->unmount_event); 406 + hm->unmount_event = NULL; 407 + 408 + xfs_healthmon_detach(hm); 409 + xfs_healthmon_put(hm); 410 + } 411 + 412 + /* Compute the reporting mask for non-unmount metadata health events. */ 413 + static inline unsigned int 414 + metadata_event_mask( 415 + struct xfs_healthmon *hm, 416 + enum xfs_healthmon_type type, 417 + unsigned int old_mask, 418 + unsigned int new_mask) 419 + { 420 + /* If we want all events, return all events. */ 421 + if (hm->verbose) 422 + return new_mask; 423 + 424 + switch (type) { 425 + case XFS_HEALTHMON_SICK: 426 + /* Always report runtime corruptions */ 427 + return new_mask; 428 + case XFS_HEALTHMON_CORRUPT: 429 + /* Only report new fsck errors */ 430 + return new_mask & ~old_mask; 431 + case XFS_HEALTHMON_HEALTHY: 432 + /* Only report healthy metadata that got fixed */ 433 + return new_mask & old_mask; 434 + default: 435 + ASSERT(0); 436 + break; 437 + } 438 + 439 + return 0; 440 + } 441 + 442 + /* Report XFS_FS_SICK_* events to healthmon */ 443 + void 444 + xfs_healthmon_report_fs( 445 + struct xfs_mount *mp, 446 + enum xfs_healthmon_type type, 447 + unsigned int old_mask, 448 + unsigned int new_mask) 449 + { 450 + struct xfs_healthmon_event event = { 451 + .type = type, 452 + .domain = XFS_HEALTHMON_FS, 453 + }; 454 + struct xfs_healthmon *hm = xfs_healthmon_get(mp); 455 + 456 + if (!hm) 457 + return; 458 + 459 + event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) & 460 + ~XFS_SICK_FS_SECONDARY; 461 + trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event); 462 + 463 + if (event.fsmask) 464 + xfs_healthmon_push(hm, &event); 465 + 466 + xfs_healthmon_put(hm); 467 + } 468 + 469 + /* Report XFS_SICK_(AG|RG)* flags to healthmon */ 470 + void 471 + xfs_healthmon_report_group( 472 + struct xfs_group *xg, 473 + enum xfs_healthmon_type type, 474 + unsigned int old_mask, 475 + unsigned int new_mask) 476 + { 477 + struct xfs_healthmon_event event = { 478 + .type = type, 479 + .group = xg->xg_gno, 480 + }; 481 + struct xfs_healthmon *hm = xfs_healthmon_get(xg->xg_mount); 482 + 483 + if (!hm) 484 + return; 485 + 486 + switch (xg->xg_type) { 487 + case XG_TYPE_RTG: 488 + event.domain = XFS_HEALTHMON_RTGROUP; 489 + event.grpmask = metadata_event_mask(hm, type, old_mask, 490 + new_mask) & 491 + ~XFS_SICK_RG_SECONDARY; 492 + break; 493 + case XG_TYPE_AG: 494 + event.domain = XFS_HEALTHMON_AG; 495 + event.grpmask = metadata_event_mask(hm, type, old_mask, 496 + new_mask) & 497 + ~XFS_SICK_AG_SECONDARY; 498 + break; 499 + default: 500 + ASSERT(0); 501 + break; 502 + } 503 + 504 + trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event); 505 + 506 + if (event.grpmask) 507 + xfs_healthmon_push(hm, &event); 508 + 509 + xfs_healthmon_put(hm); 510 + } 511 + 512 + /* Report XFS_SICK_INO_* flags to healthmon */ 513 + void 514 + xfs_healthmon_report_inode( 515 + struct xfs_inode *ip, 516 + enum xfs_healthmon_type type, 517 + unsigned int old_mask, 518 + unsigned int new_mask) 519 + { 520 + struct xfs_healthmon_event event = { 521 + .type = type, 522 + .domain = XFS_HEALTHMON_INODE, 523 + .ino = ip->i_ino, 524 + .gen = VFS_I(ip)->i_generation, 525 + }; 526 + struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); 527 + 528 + if (!hm) 529 + return; 530 + 531 + event.imask = metadata_event_mask(hm, type, old_mask, new_mask) & 532 + ~XFS_SICK_INO_SECONDARY; 533 + trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event); 534 + 535 + if (event.imask) 536 + xfs_healthmon_push(hm, &event); 537 + 538 + xfs_healthmon_put(hm); 539 + } 540 + 541 + /* Add a shutdown event to the reporting queue. */ 542 + void 543 + xfs_healthmon_report_shutdown( 544 + struct xfs_mount *mp, 545 + uint32_t flags) 546 + { 547 + struct xfs_healthmon_event event = { 548 + .type = XFS_HEALTHMON_SHUTDOWN, 549 + .domain = XFS_HEALTHMON_MOUNT, 550 + .flags = flags, 551 + }; 552 + struct xfs_healthmon *hm = xfs_healthmon_get(mp); 553 + 554 + if (!hm) 555 + return; 556 + 557 + trace_xfs_healthmon_report_shutdown(hm, flags); 558 + 559 + xfs_healthmon_push(hm, &event); 560 + xfs_healthmon_put(hm); 561 + } 562 + 563 + static inline enum xfs_healthmon_domain 564 + media_error_domain( 565 + enum xfs_device fdev) 566 + { 567 + switch (fdev) { 568 + case XFS_DEV_DATA: 569 + return XFS_HEALTHMON_DATADEV; 570 + case XFS_DEV_LOG: 571 + return XFS_HEALTHMON_LOGDEV; 572 + case XFS_DEV_RT: 573 + return XFS_HEALTHMON_RTDEV; 574 + } 575 + 576 + ASSERT(0); 577 + return 0; 578 + } 579 + 580 + /* Add a media error event to the reporting queue. */ 581 + void 582 + xfs_healthmon_report_media( 583 + struct xfs_mount *mp, 584 + enum xfs_device fdev, 585 + xfs_daddr_t daddr, 586 + uint64_t bbcount) 587 + { 588 + struct xfs_healthmon_event event = { 589 + .type = XFS_HEALTHMON_MEDIA_ERROR, 590 + .domain = media_error_domain(fdev), 591 + .daddr = daddr, 592 + .bbcount = bbcount, 593 + }; 594 + struct xfs_healthmon *hm = xfs_healthmon_get(mp); 595 + 596 + if (!hm) 597 + return; 598 + 599 + trace_xfs_healthmon_report_media(hm, fdev, &event); 600 + 601 + xfs_healthmon_push(hm, &event); 602 + xfs_healthmon_put(hm); 603 + } 604 + 605 + static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action) 606 + { 607 + switch (action) { 608 + case FSERR_BUFFERED_READ: 609 + return XFS_HEALTHMON_BUFREAD; 610 + case FSERR_BUFFERED_WRITE: 611 + return XFS_HEALTHMON_BUFWRITE; 612 + case FSERR_DIRECTIO_READ: 613 + return XFS_HEALTHMON_DIOREAD; 614 + case FSERR_DIRECTIO_WRITE: 615 + return XFS_HEALTHMON_DIOWRITE; 616 + case FSERR_DATA_LOST: 617 + return XFS_HEALTHMON_DATALOST; 618 + case FSERR_METADATA: 619 + /* filtered out by xfs_fs_report_error */ 620 + break; 621 + } 622 + 623 + ASSERT(0); 624 + return -1; 625 + } 626 + 627 + /* Add a file io error event to the reporting queue. */ 628 + void 629 + xfs_healthmon_report_file_ioerror( 630 + struct xfs_inode *ip, 631 + const struct fserror_event *p) 632 + { 633 + struct xfs_healthmon_event event = { 634 + .type = file_ioerr_type(p->type), 635 + .domain = XFS_HEALTHMON_FILERANGE, 636 + .fino = ip->i_ino, 637 + .fgen = VFS_I(ip)->i_generation, 638 + .fpos = p->pos, 639 + .flen = p->len, 640 + /* send positive error number to userspace */ 641 + .error = -p->error, 642 + }; 643 + struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); 644 + 645 + if (!hm) 646 + return; 647 + 648 + trace_xfs_healthmon_report_file_ioerror(hm, p); 649 + 650 + xfs_healthmon_push(hm, &event); 651 + xfs_healthmon_put(hm); 652 + } 653 + 654 + static inline void 655 + xfs_healthmon_reset_outbuf( 656 + struct xfs_healthmon *hm) 657 + { 658 + hm->buftail = 0; 659 + hm->bufhead = 0; 660 + } 661 + 662 + struct flags_map { 663 + unsigned int in_mask; 664 + unsigned int out_mask; 665 + }; 666 + 667 + static const struct flags_map shutdown_map[] = { 668 + { SHUTDOWN_META_IO_ERROR, XFS_HEALTH_SHUTDOWN_META_IO_ERROR }, 669 + { SHUTDOWN_LOG_IO_ERROR, XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR }, 670 + { SHUTDOWN_FORCE_UMOUNT, XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT }, 671 + { SHUTDOWN_CORRUPT_INCORE, XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE }, 672 + { SHUTDOWN_CORRUPT_ONDISK, XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK }, 673 + { SHUTDOWN_DEVICE_REMOVED, XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED }, 674 + }; 675 + 676 + static inline unsigned int 677 + __map_flags( 678 + const struct flags_map *map, 679 + size_t array_len, 680 + unsigned int flags) 681 + { 682 + const struct flags_map *m; 683 + unsigned int ret = 0; 684 + 685 + for (m = map; m < map + array_len; m++) { 686 + if (flags & m->in_mask) 687 + ret |= m->out_mask; 688 + } 689 + 690 + return ret; 691 + } 692 + 693 + #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags)) 694 + 695 + static inline unsigned int shutdown_mask(unsigned int in) 696 + { 697 + return map_flags(shutdown_map, in); 698 + } 699 + 700 + static const unsigned int domain_map[] = { 701 + [XFS_HEALTHMON_MOUNT] = XFS_HEALTH_MONITOR_DOMAIN_MOUNT, 702 + [XFS_HEALTHMON_FS] = XFS_HEALTH_MONITOR_DOMAIN_FS, 703 + [XFS_HEALTHMON_AG] = XFS_HEALTH_MONITOR_DOMAIN_AG, 704 + [XFS_HEALTHMON_INODE] = XFS_HEALTH_MONITOR_DOMAIN_INODE, 705 + [XFS_HEALTHMON_RTGROUP] = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP, 706 + [XFS_HEALTHMON_DATADEV] = XFS_HEALTH_MONITOR_DOMAIN_DATADEV, 707 + [XFS_HEALTHMON_RTDEV] = XFS_HEALTH_MONITOR_DOMAIN_RTDEV, 708 + [XFS_HEALTHMON_LOGDEV] = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV, 709 + [XFS_HEALTHMON_FILERANGE] = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE, 710 + }; 711 + 712 + static const unsigned int type_map[] = { 713 + [XFS_HEALTHMON_RUNNING] = XFS_HEALTH_MONITOR_TYPE_RUNNING, 714 + [XFS_HEALTHMON_LOST] = XFS_HEALTH_MONITOR_TYPE_LOST, 715 + [XFS_HEALTHMON_SICK] = XFS_HEALTH_MONITOR_TYPE_SICK, 716 + [XFS_HEALTHMON_CORRUPT] = XFS_HEALTH_MONITOR_TYPE_CORRUPT, 717 + [XFS_HEALTHMON_HEALTHY] = XFS_HEALTH_MONITOR_TYPE_HEALTHY, 718 + [XFS_HEALTHMON_UNMOUNT] = XFS_HEALTH_MONITOR_TYPE_UNMOUNT, 719 + [XFS_HEALTHMON_SHUTDOWN] = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN, 720 + [XFS_HEALTHMON_MEDIA_ERROR] = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR, 721 + [XFS_HEALTHMON_BUFREAD] = XFS_HEALTH_MONITOR_TYPE_BUFREAD, 722 + [XFS_HEALTHMON_BUFWRITE] = XFS_HEALTH_MONITOR_TYPE_BUFWRITE, 723 + [XFS_HEALTHMON_DIOREAD] = XFS_HEALTH_MONITOR_TYPE_DIOREAD, 724 + [XFS_HEALTHMON_DIOWRITE] = XFS_HEALTH_MONITOR_TYPE_DIOWRITE, 725 + [XFS_HEALTHMON_DATALOST] = XFS_HEALTH_MONITOR_TYPE_DATALOST, 726 + }; 727 + 728 + /* Render event as a V0 structure */ 729 + STATIC int 730 + xfs_healthmon_format_v0( 731 + struct xfs_healthmon *hm, 732 + const struct xfs_healthmon_event *event) 733 + { 734 + struct xfs_health_monitor_event hme = { 735 + .time_ns = event->time_ns, 736 + }; 737 + 738 + trace_xfs_healthmon_format(hm, event); 739 + 740 + if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) || 741 + event->type < 0 || event->type >= ARRAY_SIZE(type_map)) 742 + return -EFSCORRUPTED; 743 + 744 + hme.domain = domain_map[event->domain]; 745 + hme.type = type_map[event->type]; 746 + 747 + /* fill in the event-specific details */ 748 + switch (event->domain) { 749 + case XFS_HEALTHMON_MOUNT: 750 + switch (event->type) { 751 + case XFS_HEALTHMON_LOST: 752 + hme.e.lost.count = event->lostcount; 753 + break; 754 + case XFS_HEALTHMON_SHUTDOWN: 755 + hme.e.shutdown.reasons = shutdown_mask(event->flags); 756 + break; 757 + default: 758 + break; 759 + } 760 + break; 761 + case XFS_HEALTHMON_FS: 762 + hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask); 763 + break; 764 + case XFS_HEALTHMON_RTGROUP: 765 + hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask); 766 + hme.e.group.gno = event->group; 767 + break; 768 + case XFS_HEALTHMON_AG: 769 + hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask); 770 + hme.e.group.gno = event->group; 771 + break; 772 + case XFS_HEALTHMON_INODE: 773 + hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask); 774 + hme.e.inode.ino = event->ino; 775 + hme.e.inode.gen = event->gen; 776 + break; 777 + case XFS_HEALTHMON_DATADEV: 778 + case XFS_HEALTHMON_LOGDEV: 779 + case XFS_HEALTHMON_RTDEV: 780 + hme.e.media.daddr = event->daddr; 781 + hme.e.media.bbcount = event->bbcount; 782 + break; 783 + case XFS_HEALTHMON_FILERANGE: 784 + hme.e.filerange.ino = event->fino; 785 + hme.e.filerange.gen = event->fgen; 786 + hme.e.filerange.pos = event->fpos; 787 + hme.e.filerange.len = event->flen; 788 + hme.e.filerange.error = abs(event->error); 789 + break; 790 + default: 791 + break; 792 + } 793 + 794 + ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize); 795 + 796 + /* copy formatted object to the outbuf */ 797 + if (hm->bufhead + sizeof(hme) <= hm->bufsize) { 798 + memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme)); 799 + hm->bufhead += sizeof(hme); 800 + } 801 + 802 + return 0; 803 + } 804 + 805 + /* How many bytes are waiting in the outbuf to be copied? */ 806 + static inline size_t 807 + xfs_healthmon_outbuf_bytes( 808 + struct xfs_healthmon *hm) 809 + { 810 + if (hm->bufhead > hm->buftail) 811 + return hm->bufhead - hm->buftail; 812 + return 0; 813 + } 814 + 815 + /* 816 + * Do we have something for userspace to read? This can mean unmount events, 817 + * events pending in the queue, or pending bytes in the outbuf. 818 + */ 819 + static inline bool 820 + xfs_healthmon_has_eventdata( 821 + struct xfs_healthmon *hm) 822 + { 823 + /* 824 + * If the health monitor is already detached from the xfs_mount, we 825 + * want reads to return 0 bytes even if there are no events, because 826 + * userspace interprets that as EOF. If we race with deactivation, 827 + * read_iter will take the necessary locks to discover that there are 828 + * no events to send. 829 + */ 830 + if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) 831 + return true; 832 + 833 + /* 834 + * Either there are events waiting to be formatted into the buffer, or 835 + * there's unread bytes in the buffer. 836 + */ 837 + return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0; 838 + } 839 + 840 + /* Try to copy the rest of the outbuf to the iov iter. */ 841 + STATIC ssize_t 842 + xfs_healthmon_copybuf( 843 + struct xfs_healthmon *hm, 844 + struct iov_iter *to) 845 + { 846 + size_t to_copy; 847 + size_t w = 0; 848 + 849 + trace_xfs_healthmon_copybuf(hm, to); 850 + 851 + to_copy = xfs_healthmon_outbuf_bytes(hm); 852 + if (to_copy) { 853 + w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to); 854 + if (!w) 855 + return -EFAULT; 856 + 857 + hm->buftail += w; 858 + } 859 + 860 + /* 861 + * Nothing left to copy? Reset the output buffer cursors to the start 862 + * since there's no live data in the buffer. 863 + */ 864 + if (xfs_healthmon_outbuf_bytes(hm) == 0) 865 + xfs_healthmon_reset_outbuf(hm); 866 + return w; 867 + } 868 + 869 + /* 870 + * Return a health monitoring event for formatting into the output buffer if 871 + * there's enough space in the outbuf and an event waiting for us. Caller 872 + * must hold i_rwsem on the healthmon file. 873 + */ 874 + static inline struct xfs_healthmon_event * 875 + xfs_healthmon_format_pop( 876 + struct xfs_healthmon *hm) 877 + { 878 + struct xfs_healthmon_event *event; 879 + 880 + if (hm->bufhead + sizeof(*event) > hm->bufsize) 881 + return NULL; 882 + 883 + mutex_lock(&hm->lock); 884 + event = hm->first_event; 885 + if (event) { 886 + if (hm->last_event == event) 887 + hm->last_event = NULL; 888 + hm->first_event = event->next; 889 + hm->events--; 890 + 891 + trace_xfs_healthmon_pop(hm, event); 892 + } 893 + mutex_unlock(&hm->lock); 894 + return event; 895 + } 896 + 897 + /* Allocate formatting buffer */ 898 + STATIC int 899 + xfs_healthmon_alloc_outbuf( 900 + struct xfs_healthmon *hm, 901 + size_t user_bufsize) 902 + { 903 + void *outbuf; 904 + size_t bufsize = 905 + min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize)); 906 + 907 + outbuf = kzalloc(bufsize, GFP_KERNEL); 908 + if (!outbuf) { 909 + if (bufsize == PAGE_SIZE) 910 + return -ENOMEM; 911 + 912 + bufsize = PAGE_SIZE; 913 + outbuf = kzalloc(bufsize, GFP_KERNEL); 914 + if (!outbuf) 915 + return -ENOMEM; 916 + } 917 + 918 + hm->buffer = outbuf; 919 + hm->bufsize = bufsize; 920 + hm->bufhead = 0; 921 + hm->buftail = 0; 922 + 923 + return 0; 924 + } 925 + 926 + /* 927 + * Convey queued event data to userspace. First copy any remaining bytes in 928 + * the outbuf, then format the oldest event into the outbuf and copy that too. 929 + */ 930 + STATIC ssize_t 931 + xfs_healthmon_read_iter( 932 + struct kiocb *iocb, 933 + struct iov_iter *to) 934 + { 935 + struct file *file = iocb->ki_filp; 936 + struct inode *inode = file_inode(file); 937 + struct xfs_healthmon *hm = file->private_data; 938 + struct xfs_healthmon_event *event; 939 + size_t copied = 0; 940 + ssize_t ret = 0; 941 + 942 + if (file->f_flags & O_NONBLOCK) { 943 + if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode)) 944 + return -EAGAIN; 945 + } else { 946 + ret = wait_event_interruptible(hm->wait, 947 + xfs_healthmon_has_eventdata(hm)); 948 + if (ret) 949 + return ret; 950 + 951 + inode_lock(inode); 952 + } 953 + 954 + if (hm->bufsize == 0) { 955 + ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to)); 956 + if (ret) 957 + goto out_unlock; 958 + } 959 + 960 + trace_xfs_healthmon_read_start(hm); 961 + 962 + /* 963 + * If there's anything left in the output buffer, copy that before 964 + * formatting more events. 965 + */ 966 + ret = xfs_healthmon_copybuf(hm, to); 967 + if (ret < 0) 968 + goto out_unlock; 969 + copied += ret; 970 + 971 + while (iov_iter_count(to) > 0) { 972 + /* Format the next events into the outbuf until it's full. */ 973 + while ((event = xfs_healthmon_format_pop(hm)) != NULL) { 974 + ret = xfs_healthmon_format_v0(hm, event); 975 + kfree(event); 976 + if (ret) 977 + goto out_unlock; 978 + } 979 + 980 + /* Copy anything formatted into outbuf to userspace */ 981 + ret = xfs_healthmon_copybuf(hm, to); 982 + if (ret <= 0) 983 + break; 984 + 985 + copied += ret; 986 + } 987 + 988 + out_unlock: 989 + trace_xfs_healthmon_read_finish(hm); 990 + inode_unlock(inode); 991 + return copied ?: ret; 992 + } 993 + 994 + /* Poll for available events. */ 995 + STATIC __poll_t 996 + xfs_healthmon_poll( 997 + struct file *file, 998 + struct poll_table_struct *wait) 999 + { 1000 + struct xfs_healthmon *hm = file->private_data; 1001 + __poll_t mask = 0; 1002 + 1003 + poll_wait(file, &hm->wait, wait); 1004 + 1005 + if (xfs_healthmon_has_eventdata(hm)) 1006 + mask |= EPOLLIN; 1007 + return mask; 1008 + } 1009 + 1010 + /* Free the health monitoring information. */ 1011 + STATIC int 1012 + xfs_healthmon_release( 1013 + struct inode *inode, 1014 + struct file *file) 1015 + { 1016 + struct xfs_healthmon *hm = file->private_data; 1017 + 1018 + trace_xfs_healthmon_release(hm); 1019 + 1020 + /* 1021 + * We might be closing the healthmon file before the filesystem 1022 + * unmounts, because userspace processes can terminate at any time and 1023 + * for any reason. Null out xfs_mount::m_healthmon so that another 1024 + * process can create another health monitor file. 1025 + */ 1026 + xfs_healthmon_detach(hm); 1027 + 1028 + /* 1029 + * Wake up any readers that might be left. There shouldn't be any 1030 + * because the only users of the waiter are read and poll. 1031 + */ 1032 + wake_up_all(&hm->wait); 1033 + 1034 + xfs_healthmon_put(hm); 1035 + return 0; 1036 + } 1037 + 1038 + /* Validate ioctl parameters. */ 1039 + static inline bool 1040 + xfs_healthmon_validate( 1041 + const struct xfs_health_monitor *hmo) 1042 + { 1043 + if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL) 1044 + return false; 1045 + if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0) 1046 + return false; 1047 + if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad))) 1048 + return false; 1049 + return true; 1050 + } 1051 + 1052 + /* Emit some data about the health monitoring fd. */ 1053 + static void 1054 + xfs_healthmon_show_fdinfo( 1055 + struct seq_file *m, 1056 + struct file *file) 1057 + { 1058 + struct xfs_healthmon *hm = file->private_data; 1059 + 1060 + mutex_lock(&hm->lock); 1061 + seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n", 1062 + hm->mount_cookie == DETACHED_MOUNT_COOKIE ? 1063 + "dead" : "alive", 1064 + MAJOR(hm->dev), MINOR(hm->dev), 1065 + hm->total_events, 1066 + hm->total_lost); 1067 + mutex_unlock(&hm->lock); 1068 + } 1069 + 1070 + /* Reconfigure the health monitor. */ 1071 + STATIC long 1072 + xfs_healthmon_reconfigure( 1073 + struct file *file, 1074 + unsigned int cmd, 1075 + void __user *arg) 1076 + { 1077 + struct xfs_health_monitor hmo; 1078 + struct xfs_healthmon *hm = file->private_data; 1079 + 1080 + if (copy_from_user(&hmo, arg, sizeof(hmo))) 1081 + return -EFAULT; 1082 + 1083 + if (!xfs_healthmon_validate(&hmo)) 1084 + return -EINVAL; 1085 + 1086 + mutex_lock(&hm->lock); 1087 + hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE); 1088 + mutex_unlock(&hm->lock); 1089 + 1090 + return 0; 1091 + } 1092 + 1093 + /* Does the fd point to the same filesystem as the one we're monitoring? */ 1094 + STATIC long 1095 + xfs_healthmon_file_on_monitored_fs( 1096 + struct file *file, 1097 + unsigned int cmd, 1098 + void __user *arg) 1099 + { 1100 + struct xfs_health_file_on_monitored_fs hms; 1101 + struct xfs_healthmon *hm = file->private_data; 1102 + struct inode *hms_inode; 1103 + 1104 + if (copy_from_user(&hms, arg, sizeof(hms))) 1105 + return -EFAULT; 1106 + 1107 + if (hms.flags) 1108 + return -EINVAL; 1109 + 1110 + CLASS(fd, hms_fd)(hms.fd); 1111 + if (fd_empty(hms_fd)) 1112 + return -EBADF; 1113 + 1114 + hms_inode = file_inode(fd_file(hms_fd)); 1115 + mutex_lock(&hm->lock); 1116 + if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) { 1117 + mutex_unlock(&hm->lock); 1118 + return -ESTALE; 1119 + } 1120 + 1121 + mutex_unlock(&hm->lock); 1122 + return 0; 1123 + } 1124 + 1125 + /* Handle ioctls for the health monitoring thread. */ 1126 + STATIC long 1127 + xfs_healthmon_ioctl( 1128 + struct file *file, 1129 + unsigned int cmd, 1130 + unsigned long p) 1131 + { 1132 + void __user *arg = (void __user *)p; 1133 + 1134 + switch (cmd) { 1135 + case XFS_IOC_HEALTH_MONITOR: 1136 + return xfs_healthmon_reconfigure(file, cmd, arg); 1137 + case XFS_IOC_HEALTH_FD_ON_MONITORED_FS: 1138 + return xfs_healthmon_file_on_monitored_fs(file, cmd, arg); 1139 + default: 1140 + break; 1141 + } 1142 + 1143 + return -ENOTTY; 1144 + } 1145 + 1146 + static const struct file_operations xfs_healthmon_fops = { 1147 + .owner = THIS_MODULE, 1148 + .show_fdinfo = xfs_healthmon_show_fdinfo, 1149 + .read_iter = xfs_healthmon_read_iter, 1150 + .poll = xfs_healthmon_poll, 1151 + .release = xfs_healthmon_release, 1152 + .unlocked_ioctl = xfs_healthmon_ioctl, 1153 + }; 1154 + 1155 + /* 1156 + * Create a health monitoring file. Returns an index to the fd table or a 1157 + * negative errno. 1158 + */ 1159 + long 1160 + xfs_ioc_health_monitor( 1161 + struct file *file, 1162 + struct xfs_health_monitor __user *arg) 1163 + { 1164 + struct xfs_health_monitor hmo; 1165 + struct xfs_healthmon_event *running_event; 1166 + struct xfs_healthmon *hm; 1167 + struct xfs_inode *ip = XFS_I(file_inode(file)); 1168 + struct xfs_mount *mp = ip->i_mount; 1169 + int ret; 1170 + 1171 + /* 1172 + * The only intended user of the health monitoring system should be the 1173 + * xfs_healer daemon running on behalf of the whole filesystem in the 1174 + * initial user namespace. IOWs, we don't allow unprivileged userspace 1175 + * (they can use fsnotify) nor do we allow containers. 1176 + */ 1177 + if (!capable(CAP_SYS_ADMIN)) 1178 + return -EPERM; 1179 + if (ip->i_ino != mp->m_sb.sb_rootino) 1180 + return -EPERM; 1181 + if (current_user_ns() != &init_user_ns) 1182 + return -EPERM; 1183 + 1184 + if (copy_from_user(&hmo, arg, sizeof(hmo))) 1185 + return -EFAULT; 1186 + 1187 + if (!xfs_healthmon_validate(&hmo)) 1188 + return -EINVAL; 1189 + 1190 + hm = kzalloc(sizeof(*hm), GFP_KERNEL); 1191 + if (!hm) 1192 + return -ENOMEM; 1193 + hm->dev = mp->m_super->s_dev; 1194 + refcount_set(&hm->ref, 1); 1195 + 1196 + mutex_init(&hm->lock); 1197 + init_waitqueue_head(&hm->wait); 1198 + 1199 + if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE) 1200 + hm->verbose = true; 1201 + 1202 + /* Queue up the first event that lets the client know we're running. */ 1203 + running_event = kzalloc(sizeof(struct xfs_healthmon_event), GFP_NOFS); 1204 + if (!running_event) { 1205 + ret = -ENOMEM; 1206 + goto out_hm; 1207 + } 1208 + running_event->type = XFS_HEALTHMON_RUNNING; 1209 + running_event->domain = XFS_HEALTHMON_MOUNT; 1210 + __xfs_healthmon_insert(hm, running_event); 1211 + 1212 + /* 1213 + * Preallocate the unmount event so that we can't fail to notify the 1214 + * filesystem later. This is key for triggering fast exit of the 1215 + * xfs_healer daemon. 1216 + */ 1217 + hm->unmount_event = kzalloc(sizeof(struct xfs_healthmon_event), 1218 + GFP_NOFS); 1219 + if (!hm->unmount_event) { 1220 + ret = -ENOMEM; 1221 + goto out_hm; 1222 + } 1223 + hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT; 1224 + hm->unmount_event->domain = XFS_HEALTHMON_MOUNT; 1225 + 1226 + /* 1227 + * Try to attach this health monitor to the xfs_mount. The monitor is 1228 + * considered live and will receive events if this succeeds. 1229 + */ 1230 + ret = xfs_healthmon_attach(mp, hm); 1231 + if (ret) 1232 + goto out_hm; 1233 + 1234 + /* 1235 + * Create the anonymous file and install a fd for it. If it succeeds, 1236 + * the file owns hm and can go away at any time, so we must not access 1237 + * it again. This must go last because we can't undo a fd table 1238 + * installation. 1239 + */ 1240 + ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm, 1241 + O_CLOEXEC | O_RDONLY); 1242 + if (ret < 0) 1243 + goto out_mp; 1244 + 1245 + trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format); 1246 + 1247 + return ret; 1248 + 1249 + out_mp: 1250 + xfs_healthmon_detach(hm); 1251 + out_hm: 1252 + ASSERT(refcount_read(&hm->ref) == 1); 1253 + xfs_healthmon_put(hm); 1254 + return ret; 1255 + }

+184

fs/xfs/xfs_healthmon.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (c) 2024-2026 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef __XFS_HEALTHMON_H__ 7 + #define __XFS_HEALTHMON_H__ 8 + 9 + struct xfs_healthmon { 10 + /* 11 + * Weak reference to the xfs filesystem that is being monitored. It 12 + * will be set to zero when the filesystem detaches from the monitor. 13 + * Do not dereference this pointer. 14 + */ 15 + uintptr_t mount_cookie; 16 + 17 + /* 18 + * Device number of the filesystem being monitored. This is for 19 + * consistent tracing even after unmount. 20 + */ 21 + dev_t dev; 22 + 23 + /* 24 + * Reference count of this structure. The open healthmon fd holds one 25 + * ref, the xfs_mount holds another ref if it points to this object, 26 + * and running event handlers hold their own refs. 27 + */ 28 + refcount_t ref; 29 + 30 + /* lock for event list and event counters */ 31 + struct mutex lock; 32 + 33 + /* list of event objects */ 34 + struct xfs_healthmon_event *first_event; 35 + struct xfs_healthmon_event *last_event; 36 + 37 + /* preallocated event for unmount */ 38 + struct xfs_healthmon_event *unmount_event; 39 + 40 + /* number of events in the list */ 41 + unsigned int events; 42 + 43 + /* do we want all events? */ 44 + bool verbose:1; 45 + 46 + /* waiter so read/poll can sleep until the arrival of events */ 47 + struct wait_queue_head wait; 48 + 49 + /* 50 + * Buffer for formatting events for a read_iter call. Events are 51 + * formatted into the buffer at bufhead, and buftail determines where 52 + * to start a copy_iter to get those events to userspace. All buffer 53 + * fields are protected by inode_lock. 54 + */ 55 + char *buffer; 56 + size_t bufsize; 57 + size_t bufhead; 58 + size_t buftail; 59 + 60 + /* did we lose previous events? */ 61 + unsigned long long lost_prev_event; 62 + 63 + /* total counts of events observed and lost events */ 64 + unsigned long long total_events; 65 + unsigned long long total_lost; 66 + }; 67 + 68 + void xfs_healthmon_unmount(struct xfs_mount *mp); 69 + 70 + enum xfs_healthmon_type { 71 + XFS_HEALTHMON_RUNNING, /* monitor running */ 72 + XFS_HEALTHMON_LOST, /* message lost */ 73 + XFS_HEALTHMON_UNMOUNT, /* filesystem is unmounting */ 74 + 75 + /* filesystem shutdown */ 76 + XFS_HEALTHMON_SHUTDOWN, 77 + 78 + /* metadata health events */ 79 + XFS_HEALTHMON_SICK, /* runtime corruption observed */ 80 + XFS_HEALTHMON_CORRUPT, /* fsck reported corruption */ 81 + XFS_HEALTHMON_HEALTHY, /* fsck reported healthy structure */ 82 + 83 + /* media errors */ 84 + XFS_HEALTHMON_MEDIA_ERROR, 85 + 86 + /* file range events */ 87 + XFS_HEALTHMON_BUFREAD, 88 + XFS_HEALTHMON_BUFWRITE, 89 + XFS_HEALTHMON_DIOREAD, 90 + XFS_HEALTHMON_DIOWRITE, 91 + XFS_HEALTHMON_DATALOST, 92 + }; 93 + 94 + enum xfs_healthmon_domain { 95 + XFS_HEALTHMON_MOUNT, /* affects the whole fs */ 96 + 97 + /* metadata health events */ 98 + XFS_HEALTHMON_FS, /* main filesystem metadata */ 99 + XFS_HEALTHMON_AG, /* allocation group metadata */ 100 + XFS_HEALTHMON_INODE, /* inode metadata */ 101 + XFS_HEALTHMON_RTGROUP, /* realtime group metadata */ 102 + 103 + /* media errors */ 104 + XFS_HEALTHMON_DATADEV, 105 + XFS_HEALTHMON_RTDEV, 106 + XFS_HEALTHMON_LOGDEV, 107 + 108 + /* file range events */ 109 + XFS_HEALTHMON_FILERANGE, 110 + }; 111 + 112 + struct xfs_healthmon_event { 113 + struct xfs_healthmon_event *next; 114 + 115 + enum xfs_healthmon_type type; 116 + enum xfs_healthmon_domain domain; 117 + 118 + uint64_t time_ns; 119 + 120 + union { 121 + /* lost events */ 122 + struct { 123 + uint64_t lostcount; 124 + }; 125 + /* fs/rt metadata */ 126 + struct { 127 + /* XFS_SICK_* flags */ 128 + unsigned int fsmask; 129 + }; 130 + /* ag/rtgroup metadata */ 131 + struct { 132 + /* XFS_SICK_(AG|RG)* flags */ 133 + unsigned int grpmask; 134 + unsigned int group; 135 + }; 136 + /* inode metadata */ 137 + struct { 138 + /* XFS_SICK_INO_* flags */ 139 + unsigned int imask; 140 + uint32_t gen; 141 + xfs_ino_t ino; 142 + }; 143 + /* shutdown */ 144 + struct { 145 + unsigned int flags; 146 + }; 147 + /* media errors */ 148 + struct { 149 + xfs_daddr_t daddr; 150 + uint64_t bbcount; 151 + }; 152 + /* file range events */ 153 + struct { 154 + xfs_ino_t fino; 155 + loff_t fpos; 156 + uint64_t flen; 157 + uint32_t fgen; 158 + int error; 159 + }; 160 + }; 161 + }; 162 + 163 + void xfs_healthmon_report_fs(struct xfs_mount *mp, 164 + enum xfs_healthmon_type type, unsigned int old_mask, 165 + unsigned int new_mask); 166 + void xfs_healthmon_report_group(struct xfs_group *xg, 167 + enum xfs_healthmon_type type, unsigned int old_mask, 168 + unsigned int new_mask); 169 + void xfs_healthmon_report_inode(struct xfs_inode *ip, 170 + enum xfs_healthmon_type type, unsigned int old_mask, 171 + unsigned int new_mask); 172 + 173 + void xfs_healthmon_report_shutdown(struct xfs_mount *mp, uint32_t flags); 174 + 175 + void xfs_healthmon_report_media(struct xfs_mount *mp, enum xfs_device fdev, 176 + xfs_daddr_t daddr, uint64_t bbcount); 177 + 178 + void xfs_healthmon_report_file_ioerror(struct xfs_inode *ip, 179 + const struct fserror_event *p); 180 + 181 + long xfs_ioc_health_monitor(struct file *file, 182 + struct xfs_health_monitor __user *arg); 183 + 184 + #endif /* __XFS_HEALTHMON_H__ */

+7

fs/xfs/xfs_ioctl.c

··· 41 41 #include "xfs_exchrange.h" 42 42 #include "xfs_handle.h" 43 43 #include "xfs_rtgroup.h" 44 + #include "xfs_healthmon.h" 45 + #include "xfs_verify_media.h" 44 46 45 47 #include <linux/mount.h> 46 48 #include <linux/fileattr.h> ··· 1420 1418 return xfs_ioc_start_commit(filp, arg); 1421 1419 case XFS_IOC_COMMIT_RANGE: 1422 1420 return xfs_ioc_commit_range(filp, arg); 1421 + 1422 + case XFS_IOC_HEALTH_MONITOR: 1423 + return xfs_ioc_health_monitor(filp, arg); 1424 + case XFS_IOC_VERIFY_MEDIA: 1425 + return xfs_ioc_verify_media(filp, arg); 1423 1426 1424 1427 default: 1425 1428 return -ENOTTY;

+2

fs/xfs/xfs_mount.c

··· 41 41 #include "xfs_rtrefcount_btree.h" 42 42 #include "scrub/stats.h" 43 43 #include "xfs_zone_alloc.h" 44 + #include "xfs_healthmon.h" 44 45 45 46 static DEFINE_MUTEX(xfs_uuid_table_mutex); 46 47 static int xfs_uuid_table_size; ··· 626 625 cancel_delayed_work_sync(&mp->m_reclaim_work); 627 626 xfs_reclaim_inodes(mp); 628 627 xfs_health_unmount(mp); 628 + xfs_healthmon_unmount(mp); 629 629 } 630 630 631 631 static void

+4

fs/xfs/xfs_mount.h

··· 13 13 struct xfs_quotainfo; 14 14 struct xfs_da_geometry; 15 15 struct xfs_perag; 16 + struct xfs_healthmon; 16 17 17 18 /* dynamic preallocation free space thresholds, 5% down to 1% */ 18 19 enum { ··· 343 342 344 343 /* Hook to feed dirent updates to an active online repair. */ 345 344 struct xfs_hooks m_dir_update_hooks; 345 + 346 + /* Private data referring to a health monitor object. */ 347 + struct xfs_healthmon *m_healthmon; 346 348 } xfs_mount_t; 347 349 348 350 #define M_IGEO(mp) (&(mp)->m_ino_geo)

+16 -5

fs/xfs/xfs_notify_failure.c

··· 22 22 #include "xfs_notify_failure.h" 23 23 #include "xfs_rtgroup.h" 24 24 #include "xfs_rtrmap_btree.h" 25 + #include "xfs_healthmon.h" 25 26 26 27 #include <linux/mm.h> 27 28 #include <linux/dax.h> 28 29 #include <linux/fs.h> 30 + #include <linux/fserror.h> 29 31 30 32 struct xfs_failure_info { 31 33 xfs_agblock_t startblock; ··· 117 115 if (notify->mf_flags & MF_MEM_PRE_REMOVE) 118 116 invalidate_inode_pages2_range(mapping, pgoff, 119 117 pgoff + pgcnt - 1); 118 + 119 + fserror_report_data_lost(VFS_I(ip), (u64)pgoff << PAGE_SHIFT, 120 + (u64)pgcnt << PAGE_SHIFT, GFP_NOFS); 120 121 121 122 xfs_irele(ip); 122 123 return error; ··· 220 215 if (error) 221 216 return error; 222 217 218 + xfs_healthmon_report_media(mp, XFS_DEV_LOG, daddr, bblen); 219 + 223 220 /* 224 221 * In the pre-remove case the failure notification is attempting to 225 222 * trigger a force unmount. The expectation is that the device is ··· 255 248 uint64_t bblen; 256 249 struct xfs_group *xg = NULL; 257 250 258 - if (!xfs_has_rmapbt(mp)) { 259 - xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); 260 - return -EOPNOTSUPP; 261 - } 262 - 263 251 error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type), 264 252 offset, len, &daddr, &bblen); 265 253 if (error) 266 254 return error; 255 + 256 + xfs_healthmon_report_media(mp, 257 + type == XG_TYPE_RTG ? XFS_DEV_RT : XFS_DEV_DATA, 258 + daddr, bblen); 259 + 260 + if (!xfs_has_rmapbt(mp)) { 261 + xfs_debug(mp, "notify_failure() needs rmapbt enabled!"); 262 + return -EOPNOTSUPP; 263 + } 267 264 268 265 if (type == XG_TYPE_RTG) { 269 266 start_bno = xfs_daddr_to_rtb(mp, daddr);

-2

fs/xfs/xfs_platform.h

··· 133 133 134 134 #define ENOATTR ENODATA /* Attribute not found */ 135 135 #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ 136 - #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 137 - #define EFSBADCRC EBADMSG /* Bad CRC detected */ 138 136 139 137 #define __return_address __builtin_return_address(0) 140 138

+12

fs/xfs/xfs_super.c

··· 47 47 #include "xfs_parent.h" 48 48 #include "xfs_rtalloc.h" 49 49 #include "xfs_zone_alloc.h" 50 + #include "xfs_healthmon.h" 50 51 #include "scrub/stats.h" 51 52 #include "scrub/rcbag_btree.h" 52 53 53 54 #include <linux/magic.h> 54 55 #include <linux/fs_context.h> 55 56 #include <linux/fs_parser.h> 57 + #include <linux/fserror.h> 56 58 57 59 static const struct super_operations xfs_super_operations; 58 60 ··· 1303 1301 return 0; 1304 1302 } 1305 1303 1304 + static void 1305 + xfs_fs_report_error( 1306 + const struct fserror_event *event) 1307 + { 1308 + /* healthmon already knows about non-inode and metadata errors */ 1309 + if (event->inode && event->type != FSERR_METADATA) 1310 + xfs_healthmon_report_file_ioerror(XFS_I(event->inode), event); 1311 + } 1312 + 1306 1313 static const struct super_operations xfs_super_operations = { 1307 1314 .alloc_inode = xfs_fs_alloc_inode, 1308 1315 .destroy_inode = xfs_fs_destroy_inode, ··· 1328 1317 .free_cached_objects = xfs_fs_free_cached_objects, 1329 1318 .shutdown = xfs_fs_shutdown, 1330 1319 .show_stats = xfs_fs_show_stats, 1320 + .report_error = xfs_fs_report_error, 1331 1321 }; 1332 1322 1333 1323 static int

+5

fs/xfs/xfs_trace.c

··· 51 51 #include "xfs_rtgroup.h" 52 52 #include "xfs_zone_alloc.h" 53 53 #include "xfs_zone_priv.h" 54 + #include "xfs_health.h" 55 + #include "xfs_healthmon.h" 56 + #include "xfs_notify_failure.h" 57 + #include "xfs_file.h" 58 + #include <linux/fserror.h> 54 59 55 60 /* 56 61 * We include this last to have the helpers above available for the trace

+512

fs/xfs/xfs_trace.h

··· 103 103 struct xfs_metadir_update; 104 104 struct xfs_rtgroup; 105 105 struct xfs_open_zone; 106 + struct xfs_healthmon_event; 107 + struct xfs_healthmon; 108 + struct fserror_event; 106 109 107 110 #define XFS_ATTR_FILTER_FLAGS \ 108 111 { XFS_ATTR_ROOT, "ROOT" }, \ ··· 5908 5905 TP_ARGS(mp, ctr, delta, caller_ip)) 5909 5906 DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); 5910 5907 DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); 5908 + 5909 + TRACE_EVENT(xfs_healthmon_lost_event, 5910 + TP_PROTO(const struct xfs_healthmon *hm), 5911 + TP_ARGS(hm), 5912 + TP_STRUCT__entry( 5913 + __field(dev_t, dev) 5914 + __field(unsigned long long, lost_prev) 5915 + ), 5916 + TP_fast_assign( 5917 + __entry->dev = hm->dev; 5918 + __entry->lost_prev = hm->lost_prev_event; 5919 + ), 5920 + TP_printk("dev %d:%d lost_prev %llu", 5921 + MAJOR(__entry->dev), MINOR(__entry->dev), 5922 + __entry->lost_prev) 5923 + ); 5924 + 5925 + #define XFS_HEALTHMON_FLAGS_STRINGS \ 5926 + { XFS_HEALTH_MONITOR_VERBOSE, "verbose" } 5927 + #define XFS_HEALTHMON_FMT_STRINGS \ 5928 + { XFS_HEALTH_MONITOR_FMT_V0, "v0" } 5929 + 5930 + TRACE_EVENT(xfs_healthmon_create, 5931 + TP_PROTO(dev_t dev, u64 flags, u8 format), 5932 + TP_ARGS(dev, flags, format), 5933 + TP_STRUCT__entry( 5934 + __field(dev_t, dev) 5935 + __field(u64, flags) 5936 + __field(u8, format) 5937 + ), 5938 + TP_fast_assign( 5939 + __entry->dev = dev; 5940 + __entry->flags = flags; 5941 + __entry->format = format; 5942 + ), 5943 + TP_printk("dev %d:%d flags %s format %s", 5944 + MAJOR(__entry->dev), MINOR(__entry->dev), 5945 + __print_flags(__entry->flags, "|", XFS_HEALTHMON_FLAGS_STRINGS), 5946 + __print_symbolic(__entry->format, XFS_HEALTHMON_FMT_STRINGS)) 5947 + ); 5948 + 5949 + TRACE_EVENT(xfs_healthmon_copybuf, 5950 + TP_PROTO(const struct xfs_healthmon *hm, const struct iov_iter *iov), 5951 + TP_ARGS(hm, iov), 5952 + TP_STRUCT__entry( 5953 + __field(dev_t, dev) 5954 + __field(size_t, bufsize) 5955 + __field(size_t, inpos) 5956 + __field(size_t, outpos) 5957 + __field(size_t, to_copy) 5958 + __field(size_t, iter_count) 5959 + ), 5960 + TP_fast_assign( 5961 + __entry->dev = hm->dev; 5962 + __entry->bufsize = hm->bufsize; 5963 + __entry->inpos = hm->bufhead; 5964 + __entry->outpos = hm->buftail; 5965 + if (hm->bufhead > hm->buftail) 5966 + __entry->to_copy = hm->bufhead - hm->buftail; 5967 + else 5968 + __entry->to_copy = 0; 5969 + __entry->iter_count = iov_iter_count(iov); 5970 + ), 5971 + TP_printk("dev %d:%d bufsize %zu in_pos %zu out_pos %zu to_copy %zu iter_count %zu", 5972 + MAJOR(__entry->dev), MINOR(__entry->dev), 5973 + __entry->bufsize, 5974 + __entry->inpos, 5975 + __entry->outpos, 5976 + __entry->to_copy, 5977 + __entry->iter_count) 5978 + ); 5979 + 5980 + DECLARE_EVENT_CLASS(xfs_healthmon_class, 5981 + TP_PROTO(const struct xfs_healthmon *hm), 5982 + TP_ARGS(hm), 5983 + TP_STRUCT__entry( 5984 + __field(dev_t, dev) 5985 + __field(unsigned int, events) 5986 + __field(unsigned long long, lost_prev) 5987 + ), 5988 + TP_fast_assign( 5989 + __entry->dev = hm->dev; 5990 + __entry->events = hm->events; 5991 + __entry->lost_prev = hm->lost_prev_event; 5992 + ), 5993 + TP_printk("dev %d:%d events %u lost_prev? %llu", 5994 + MAJOR(__entry->dev), MINOR(__entry->dev), 5995 + __entry->events, 5996 + __entry->lost_prev) 5997 + ); 5998 + #define DEFINE_HEALTHMON_EVENT(name) \ 5999 + DEFINE_EVENT(xfs_healthmon_class, name, \ 6000 + TP_PROTO(const struct xfs_healthmon *hm), \ 6001 + TP_ARGS(hm)) 6002 + DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_start); 6003 + DEFINE_HEALTHMON_EVENT(xfs_healthmon_read_finish); 6004 + DEFINE_HEALTHMON_EVENT(xfs_healthmon_release); 6005 + DEFINE_HEALTHMON_EVENT(xfs_healthmon_detach); 6006 + DEFINE_HEALTHMON_EVENT(xfs_healthmon_report_unmount); 6007 + 6008 + #define XFS_HEALTHMON_TYPE_STRINGS \ 6009 + { XFS_HEALTHMON_LOST, "lost" }, \ 6010 + { XFS_HEALTHMON_UNMOUNT, "unmount" }, \ 6011 + { XFS_HEALTHMON_SICK, "sick" }, \ 6012 + { XFS_HEALTHMON_CORRUPT, "corrupt" }, \ 6013 + { XFS_HEALTHMON_HEALTHY, "healthy" }, \ 6014 + { XFS_HEALTHMON_SHUTDOWN, "shutdown" } 6015 + 6016 + #define XFS_HEALTHMON_DOMAIN_STRINGS \ 6017 + { XFS_HEALTHMON_MOUNT, "mount" }, \ 6018 + { XFS_HEALTHMON_FS, "fs" }, \ 6019 + { XFS_HEALTHMON_AG, "ag" }, \ 6020 + { XFS_HEALTHMON_INODE, "inode" }, \ 6021 + { XFS_HEALTHMON_RTGROUP, "rtgroup" } 6022 + 6023 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_LOST); 6024 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_SHUTDOWN); 6025 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_UNMOUNT); 6026 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_SICK); 6027 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_CORRUPT); 6028 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_HEALTHY); 6029 + 6030 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_MOUNT); 6031 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_FS); 6032 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_AG); 6033 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_INODE); 6034 + TRACE_DEFINE_ENUM(XFS_HEALTHMON_RTGROUP); 6035 + 6036 + DECLARE_EVENT_CLASS(xfs_healthmon_event_class, 6037 + TP_PROTO(const struct xfs_healthmon *hm, 6038 + const struct xfs_healthmon_event *event), 6039 + TP_ARGS(hm, event), 6040 + TP_STRUCT__entry( 6041 + __field(dev_t, dev) 6042 + __field(unsigned int, type) 6043 + __field(unsigned int, domain) 6044 + __field(unsigned int, mask) 6045 + __field(unsigned long long, ino) 6046 + __field(unsigned int, gen) 6047 + __field(unsigned int, group) 6048 + __field(unsigned long long, offset) 6049 + __field(unsigned long long, length) 6050 + __field(unsigned long long, lostcount) 6051 + ), 6052 + TP_fast_assign( 6053 + __entry->dev = hm->dev; 6054 + __entry->type = event->type; 6055 + __entry->domain = event->domain; 6056 + __entry->mask = 0; 6057 + __entry->group = 0; 6058 + __entry->ino = 0; 6059 + __entry->gen = 0; 6060 + __entry->offset = 0; 6061 + __entry->length = 0; 6062 + __entry->lostcount = 0; 6063 + switch (__entry->domain) { 6064 + case XFS_HEALTHMON_MOUNT: 6065 + switch (__entry->type) { 6066 + case XFS_HEALTHMON_SHUTDOWN: 6067 + __entry->mask = event->flags; 6068 + break; 6069 + case XFS_HEALTHMON_LOST: 6070 + __entry->lostcount = event->lostcount; 6071 + break; 6072 + } 6073 + break; 6074 + case XFS_HEALTHMON_FS: 6075 + __entry->mask = event->fsmask; 6076 + break; 6077 + case XFS_HEALTHMON_AG: 6078 + case XFS_HEALTHMON_RTGROUP: 6079 + __entry->mask = event->grpmask; 6080 + __entry->group = event->group; 6081 + break; 6082 + case XFS_HEALTHMON_INODE: 6083 + __entry->mask = event->imask; 6084 + __entry->ino = event->ino; 6085 + __entry->gen = event->gen; 6086 + break; 6087 + case XFS_HEALTHMON_DATADEV: 6088 + case XFS_HEALTHMON_LOGDEV: 6089 + case XFS_HEALTHMON_RTDEV: 6090 + __entry->offset = event->daddr; 6091 + __entry->length = event->bbcount; 6092 + break; 6093 + case XFS_HEALTHMON_FILERANGE: 6094 + __entry->ino = event->fino; 6095 + __entry->gen = event->fgen; 6096 + __entry->offset = event->fpos; 6097 + __entry->length = event->flen; 6098 + break; 6099 + } 6100 + ), 6101 + TP_printk("dev %d:%d type %s domain %s mask 0x%x ino 0x%llx gen 0x%x offset 0x%llx len 0x%llx group 0x%x lost %llu", 6102 + MAJOR(__entry->dev), MINOR(__entry->dev), 6103 + __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), 6104 + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), 6105 + __entry->mask, 6106 + __entry->ino, 6107 + __entry->gen, 6108 + __entry->offset, 6109 + __entry->length, 6110 + __entry->group, 6111 + __entry->lostcount) 6112 + ); 6113 + #define DEFINE_HEALTHMONEVENT_EVENT(name) \ 6114 + DEFINE_EVENT(xfs_healthmon_event_class, name, \ 6115 + TP_PROTO(const struct xfs_healthmon *hm, \ 6116 + const struct xfs_healthmon_event *event), \ 6117 + TP_ARGS(hm, event)) 6118 + DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_insert); 6119 + DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_push); 6120 + DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_pop); 6121 + DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format); 6122 + DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_format_overflow); 6123 + DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_drop); 6124 + DEFINE_HEALTHMONEVENT_EVENT(xfs_healthmon_merge); 6125 + 6126 + TRACE_EVENT(xfs_healthmon_report_fs, 6127 + TP_PROTO(const struct xfs_healthmon *hm, 6128 + unsigned int old_mask, unsigned int new_mask, 6129 + const struct xfs_healthmon_event *event), 6130 + TP_ARGS(hm, old_mask, new_mask, event), 6131 + TP_STRUCT__entry( 6132 + __field(dev_t, dev) 6133 + __field(unsigned int, type) 6134 + __field(unsigned int, domain) 6135 + __field(unsigned int, old_mask) 6136 + __field(unsigned int, new_mask) 6137 + __field(unsigned int, fsmask) 6138 + ), 6139 + TP_fast_assign( 6140 + __entry->dev = hm->dev; 6141 + __entry->type = event->type; 6142 + __entry->domain = event->domain; 6143 + __entry->old_mask = old_mask; 6144 + __entry->new_mask = new_mask; 6145 + __entry->fsmask = event->fsmask; 6146 + ), 6147 + TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x fsmask 0x%x", 6148 + MAJOR(__entry->dev), MINOR(__entry->dev), 6149 + __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), 6150 + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), 6151 + __entry->old_mask, 6152 + __entry->new_mask, 6153 + __entry->fsmask) 6154 + ); 6155 + 6156 + TRACE_EVENT(xfs_healthmon_report_group, 6157 + TP_PROTO(const struct xfs_healthmon *hm, 6158 + unsigned int old_mask, unsigned int new_mask, 6159 + const struct xfs_healthmon_event *event), 6160 + TP_ARGS(hm, old_mask, new_mask, event), 6161 + TP_STRUCT__entry( 6162 + __field(dev_t, dev) 6163 + __field(unsigned int, type) 6164 + __field(unsigned int, domain) 6165 + __field(unsigned int, old_mask) 6166 + __field(unsigned int, new_mask) 6167 + __field(unsigned int, grpmask) 6168 + __field(unsigned int, group) 6169 + ), 6170 + TP_fast_assign( 6171 + __entry->dev = hm->dev; 6172 + __entry->type = event->type; 6173 + __entry->domain = event->domain; 6174 + __entry->old_mask = old_mask; 6175 + __entry->new_mask = new_mask; 6176 + __entry->grpmask = event->grpmask; 6177 + __entry->group = event->group; 6178 + ), 6179 + TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x grpmask 0x%x group 0x%x", 6180 + MAJOR(__entry->dev), MINOR(__entry->dev), 6181 + __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), 6182 + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), 6183 + __entry->old_mask, 6184 + __entry->new_mask, 6185 + __entry->grpmask, 6186 + __entry->group) 6187 + ); 6188 + 6189 + TRACE_EVENT(xfs_healthmon_report_inode, 6190 + TP_PROTO(const struct xfs_healthmon *hm, 6191 + unsigned int old_mask, unsigned int new_mask, 6192 + const struct xfs_healthmon_event *event), 6193 + TP_ARGS(hm, old_mask, new_mask, event), 6194 + TP_STRUCT__entry( 6195 + __field(dev_t, dev) 6196 + __field(unsigned int, type) 6197 + __field(unsigned int, domain) 6198 + __field(unsigned int, old_mask) 6199 + __field(unsigned int, new_mask) 6200 + __field(unsigned int, imask) 6201 + __field(unsigned long long, ino) 6202 + __field(unsigned int, gen) 6203 + ), 6204 + TP_fast_assign( 6205 + __entry->dev = hm->dev; 6206 + __entry->type = event->type; 6207 + __entry->domain = event->domain; 6208 + __entry->old_mask = old_mask; 6209 + __entry->new_mask = new_mask; 6210 + __entry->imask = event->imask; 6211 + __entry->ino = event->ino; 6212 + __entry->gen = event->gen; 6213 + ), 6214 + TP_printk("dev %d:%d type %s domain %s oldmask 0x%x newmask 0x%x imask 0x%x ino 0x%llx gen 0x%x", 6215 + MAJOR(__entry->dev), MINOR(__entry->dev), 6216 + __print_symbolic(__entry->type, XFS_HEALTHMON_TYPE_STRINGS), 6217 + __print_symbolic(__entry->domain, XFS_HEALTHMON_DOMAIN_STRINGS), 6218 + __entry->old_mask, 6219 + __entry->new_mask, 6220 + __entry->imask, 6221 + __entry->ino, 6222 + __entry->gen) 6223 + ); 6224 + 6225 + TRACE_EVENT(xfs_healthmon_report_shutdown, 6226 + TP_PROTO(const struct xfs_healthmon *hm, uint32_t shutdown_flags), 6227 + TP_ARGS(hm, shutdown_flags), 6228 + TP_STRUCT__entry( 6229 + __field(dev_t, dev) 6230 + __field(uint32_t, shutdown_flags) 6231 + ), 6232 + TP_fast_assign( 6233 + __entry->dev = hm->dev; 6234 + __entry->shutdown_flags = shutdown_flags; 6235 + ), 6236 + TP_printk("dev %d:%d shutdown_flags %s", 6237 + MAJOR(__entry->dev), MINOR(__entry->dev), 6238 + __print_flags(__entry->shutdown_flags, "|", XFS_SHUTDOWN_STRINGS)) 6239 + ); 6240 + 6241 + #define XFS_DEVICE_STRINGS \ 6242 + { XFS_DEV_DATA, "datadev" }, \ 6243 + { XFS_DEV_RT, "rtdev" }, \ 6244 + { XFS_DEV_LOG, "logdev" } 6245 + 6246 + TRACE_DEFINE_ENUM(XFS_DEV_DATA); 6247 + TRACE_DEFINE_ENUM(XFS_DEV_RT); 6248 + TRACE_DEFINE_ENUM(XFS_DEV_LOG); 6249 + 6250 + TRACE_EVENT(xfs_healthmon_report_media, 6251 + TP_PROTO(const struct xfs_healthmon *hm, enum xfs_device fdev, 6252 + const struct xfs_healthmon_event *event), 6253 + TP_ARGS(hm, fdev, event), 6254 + TP_STRUCT__entry( 6255 + __field(dev_t, dev) 6256 + __field(unsigned int, error_dev) 6257 + __field(uint64_t, daddr) 6258 + __field(uint64_t, bbcount) 6259 + ), 6260 + TP_fast_assign( 6261 + __entry->dev = hm->dev; 6262 + __entry->error_dev = fdev; 6263 + __entry->daddr = event->daddr; 6264 + __entry->bbcount = event->bbcount; 6265 + ), 6266 + TP_printk("dev %d:%d %s daddr 0x%llx bbcount 0x%llx", 6267 + MAJOR(__entry->dev), MINOR(__entry->dev), 6268 + __print_symbolic(__entry->error_dev, XFS_DEVICE_STRINGS), 6269 + __entry->daddr, 6270 + __entry->bbcount) 6271 + ); 6272 + 6273 + #define FS_ERROR_STRINGS \ 6274 + { FSERR_BUFFERED_READ, "buffered_read" }, \ 6275 + { FSERR_BUFFERED_WRITE, "buffered_write" }, \ 6276 + { FSERR_DIRECTIO_READ, "directio_read" }, \ 6277 + { FSERR_DIRECTIO_WRITE, "directio_write" }, \ 6278 + { FSERR_DATA_LOST, "data_lost" }, \ 6279 + { FSERR_METADATA, "metadata" } 6280 + 6281 + TRACE_DEFINE_ENUM(FSERR_BUFFERED_READ); 6282 + TRACE_DEFINE_ENUM(FSERR_BUFFERED_WRITE); 6283 + TRACE_DEFINE_ENUM(FSERR_DIRECTIO_READ); 6284 + TRACE_DEFINE_ENUM(FSERR_DIRECTIO_WRITE); 6285 + TRACE_DEFINE_ENUM(FSERR_DATA_LOST); 6286 + TRACE_DEFINE_ENUM(FSERR_METADATA); 6287 + 6288 + TRACE_EVENT(xfs_healthmon_report_file_ioerror, 6289 + TP_PROTO(const struct xfs_healthmon *hm, 6290 + const struct fserror_event *p), 6291 + TP_ARGS(hm, p), 6292 + TP_STRUCT__entry( 6293 + __field(dev_t, dev) 6294 + __field(unsigned int, type) 6295 + __field(unsigned long long, ino) 6296 + __field(unsigned int, gen) 6297 + __field(long long, pos) 6298 + __field(unsigned long long, len) 6299 + __field(int, error) 6300 + ), 6301 + TP_fast_assign( 6302 + __entry->dev = hm->dev; 6303 + __entry->type = p->type; 6304 + __entry->ino = XFS_I(p->inode)->i_ino; 6305 + __entry->gen = p->inode->i_generation; 6306 + __entry->pos = p->pos; 6307 + __entry->len = p->len; 6308 + __entry->error = p->error; 6309 + ), 6310 + TP_printk("dev %d:%d ino 0x%llx gen 0x%x op %s pos 0x%llx bytecount 0x%llx error %d", 6311 + MAJOR(__entry->dev), MINOR(__entry->dev), 6312 + __entry->ino, 6313 + __entry->gen, 6314 + __print_symbolic(__entry->type, FS_ERROR_STRINGS), 6315 + __entry->pos, 6316 + __entry->len, 6317 + __entry->error) 6318 + ); 6319 + 6320 + TRACE_EVENT(xfs_verify_media, 6321 + TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, 6322 + dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount, 6323 + const struct folio *folio), 6324 + TP_ARGS(mp, me, fdev, daddr, bbcount, folio), 6325 + TP_STRUCT__entry( 6326 + __field(dev_t, dev) 6327 + __field(dev_t, fdev) 6328 + __field(xfs_daddr_t, start_daddr) 6329 + __field(xfs_daddr_t, end_daddr) 6330 + __field(unsigned int, flags) 6331 + __field(xfs_daddr_t, daddr) 6332 + __field(uint64_t, bbcount) 6333 + __field(unsigned int, bufsize) 6334 + ), 6335 + TP_fast_assign( 6336 + __entry->dev = mp->m_ddev_targp->bt_dev; 6337 + __entry->fdev = fdev; 6338 + __entry->start_daddr = me->me_start_daddr; 6339 + __entry->end_daddr = me->me_end_daddr; 6340 + __entry->flags = me->me_flags; 6341 + __entry->daddr = daddr; 6342 + __entry->bbcount = bbcount; 6343 + __entry->bufsize = folio_size(folio); 6344 + ), 6345 + TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx bufsize 0x%x", 6346 + MAJOR(__entry->dev), MINOR(__entry->dev), 6347 + MAJOR(__entry->fdev), MINOR(__entry->fdev), 6348 + __entry->start_daddr, 6349 + __entry->end_daddr, 6350 + __entry->flags, 6351 + __entry->daddr, 6352 + __entry->bbcount, 6353 + __entry->bufsize) 6354 + ); 6355 + 6356 + TRACE_EVENT(xfs_verify_media_end, 6357 + TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, 6358 + dev_t fdev), 6359 + TP_ARGS(mp, me, fdev), 6360 + TP_STRUCT__entry( 6361 + __field(dev_t, dev) 6362 + __field(dev_t, fdev) 6363 + __field(xfs_daddr_t, start_daddr) 6364 + __field(xfs_daddr_t, end_daddr) 6365 + __field(int, ioerror) 6366 + ), 6367 + TP_fast_assign( 6368 + __entry->dev = mp->m_ddev_targp->bt_dev; 6369 + __entry->fdev = fdev; 6370 + __entry->start_daddr = me->me_start_daddr; 6371 + __entry->end_daddr = me->me_end_daddr; 6372 + __entry->ioerror = me->me_ioerror; 6373 + ), 6374 + TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx ioerror %d", 6375 + MAJOR(__entry->dev), MINOR(__entry->dev), 6376 + MAJOR(__entry->fdev), MINOR(__entry->fdev), 6377 + __entry->start_daddr, 6378 + __entry->end_daddr, 6379 + __entry->ioerror) 6380 + ); 6381 + 6382 + TRACE_EVENT(xfs_verify_media_error, 6383 + TP_PROTO(const struct xfs_mount *mp, const struct xfs_verify_media *me, 6384 + dev_t fdev, xfs_daddr_t daddr, uint64_t bbcount, 6385 + blk_status_t status), 6386 + TP_ARGS(mp, me, fdev, daddr, bbcount, status), 6387 + TP_STRUCT__entry( 6388 + __field(dev_t, dev) 6389 + __field(dev_t, fdev) 6390 + __field(xfs_daddr_t, start_daddr) 6391 + __field(xfs_daddr_t, end_daddr) 6392 + __field(unsigned int, flags) 6393 + __field(xfs_daddr_t, daddr) 6394 + __field(uint64_t, bbcount) 6395 + __field(int, error) 6396 + ), 6397 + TP_fast_assign( 6398 + __entry->dev = mp->m_ddev_targp->bt_dev; 6399 + __entry->fdev = fdev; 6400 + __entry->start_daddr = me->me_start_daddr; 6401 + __entry->end_daddr = me->me_end_daddr; 6402 + __entry->flags = me->me_flags; 6403 + __entry->daddr = daddr; 6404 + __entry->bbcount = bbcount; 6405 + __entry->error = blk_status_to_errno(status); 6406 + ), 6407 + TP_printk("dev %d:%d fdev %d:%d start_daddr 0x%llx end_daddr 0x%llx flags 0x%x daddr 0x%llx bbcount 0x%llx error %d", 6408 + MAJOR(__entry->dev), MINOR(__entry->dev), 6409 + MAJOR(__entry->fdev), MINOR(__entry->fdev), 6410 + __entry->start_daddr, 6411 + __entry->end_daddr, 6412 + __entry->flags, 6413 + __entry->daddr, 6414 + __entry->bbcount, 6415 + __entry->error) 6416 + ); 5911 6417 5912 6418 #endif /* _TRACE_XFS_H */ 5913 6419

+445

fs/xfs/xfs_verify_media.c

··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Copyright (c) 2026 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #include "xfs_platform.h" 7 + #include "xfs_shared.h" 8 + #include "xfs_format.h" 9 + #include "xfs_log_format.h" 10 + #include "xfs_trans_resv.h" 11 + #include "xfs_mount.h" 12 + #include "xfs_bit.h" 13 + #include "xfs_btree.h" 14 + #include "xfs_inode.h" 15 + #include "xfs_icache.h" 16 + #include "xfs_trans.h" 17 + #include "xfs_alloc.h" 18 + #include "xfs_ag.h" 19 + #include "xfs_rmap.h" 20 + #include "xfs_rmap_btree.h" 21 + #include "xfs_rtgroup.h" 22 + #include "xfs_rtrmap_btree.h" 23 + #include "xfs_health.h" 24 + #include "xfs_healthmon.h" 25 + #include "xfs_trace.h" 26 + #include "xfs_verify_media.h" 27 + 28 + #include <linux/fserror.h> 29 + 30 + struct xfs_group_data_lost { 31 + xfs_agblock_t startblock; 32 + xfs_extlen_t blockcount; 33 + }; 34 + 35 + /* Report lost file data from rmap records */ 36 + static int 37 + xfs_verify_report_data_lost( 38 + struct xfs_btree_cur *cur, 39 + const struct xfs_rmap_irec *rec, 40 + void *data) 41 + { 42 + struct xfs_mount *mp = cur->bc_mp; 43 + struct xfs_inode *ip; 44 + struct xfs_group_data_lost *lost = data; 45 + xfs_fileoff_t fileoff = rec->rm_offset; 46 + xfs_extlen_t blocks = rec->rm_blockcount; 47 + const bool is_attr = 48 + (rec->rm_flags & XFS_RMAP_ATTR_FORK); 49 + const xfs_agblock_t lost_end = 50 + lost->startblock + lost->blockcount; 51 + const xfs_agblock_t rmap_end = 52 + rec->rm_startblock + rec->rm_blockcount; 53 + int error = 0; 54 + 55 + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) 56 + return 0; 57 + 58 + error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip); 59 + if (error) 60 + return 0; 61 + 62 + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { 63 + xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK); 64 + goto out_rele; 65 + } 66 + 67 + if (is_attr) { 68 + xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR); 69 + goto out_rele; 70 + } 71 + 72 + if (lost->startblock > rec->rm_startblock) { 73 + fileoff += lost->startblock - rec->rm_startblock; 74 + blocks -= lost->startblock - rec->rm_startblock; 75 + } 76 + if (rmap_end > lost_end) 77 + blocks -= rmap_end - lost_end; 78 + 79 + fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff), 80 + XFS_FSB_TO_B(mp, blocks), GFP_NOFS); 81 + 82 + out_rele: 83 + xfs_irele(ip); 84 + return 0; 85 + } 86 + 87 + /* Walk reverse mappings to look for all file data loss */ 88 + static int 89 + xfs_verify_report_losses( 90 + struct xfs_mount *mp, 91 + enum xfs_group_type type, 92 + xfs_daddr_t daddr, 93 + u64 bblen) 94 + { 95 + struct xfs_group *xg = NULL; 96 + struct xfs_trans *tp; 97 + xfs_fsblock_t start_bno, end_bno; 98 + uint32_t start_gno, end_gno; 99 + int error; 100 + 101 + if (type == XG_TYPE_RTG) { 102 + start_bno = xfs_daddr_to_rtb(mp, daddr); 103 + end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1); 104 + } else { 105 + start_bno = XFS_DADDR_TO_FSB(mp, daddr); 106 + end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1); 107 + } 108 + 109 + tp = xfs_trans_alloc_empty(mp); 110 + start_gno = xfs_fsb_to_gno(mp, start_bno, type); 111 + end_gno = xfs_fsb_to_gno(mp, end_bno, type); 112 + while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { 113 + struct xfs_buf *agf_bp = NULL; 114 + struct xfs_rtgroup *rtg = NULL; 115 + struct xfs_btree_cur *cur; 116 + struct xfs_rmap_irec ri_low = { }; 117 + struct xfs_rmap_irec ri_high; 118 + struct xfs_group_data_lost lost; 119 + 120 + if (type == XG_TYPE_AG) { 121 + struct xfs_perag *pag = to_perag(xg); 122 + 123 + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); 124 + if (error) { 125 + xfs_perag_put(pag); 126 + break; 127 + } 128 + 129 + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); 130 + } else { 131 + rtg = to_rtg(xg); 132 + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 133 + cur = xfs_rtrmapbt_init_cursor(tp, rtg); 134 + } 135 + 136 + /* 137 + * Set the rmap range from ri_low to ri_high, which represents 138 + * a [start, end] where we looking for the files or metadata. 139 + */ 140 + memset(&ri_high, 0xFF, sizeof(ri_high)); 141 + if (xg->xg_gno == start_gno) 142 + ri_low.rm_startblock = 143 + xfs_fsb_to_gbno(mp, start_bno, type); 144 + if (xg->xg_gno == end_gno) 145 + ri_high.rm_startblock = 146 + xfs_fsb_to_gbno(mp, end_bno, type); 147 + 148 + lost.startblock = ri_low.rm_startblock; 149 + lost.blockcount = min(xg->xg_block_count, 150 + ri_high.rm_startblock + 1) - 151 + ri_low.rm_startblock; 152 + 153 + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, 154 + xfs_verify_report_data_lost, &lost); 155 + xfs_btree_del_cursor(cur, error); 156 + if (agf_bp) 157 + xfs_trans_brelse(tp, agf_bp); 158 + if (rtg) 159 + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 160 + if (error) { 161 + xfs_group_put(xg); 162 + break; 163 + } 164 + } 165 + 166 + xfs_trans_cancel(tp); 167 + return 0; 168 + } 169 + 170 + /* 171 + * Compute the desired verify IO size. 172 + * 173 + * To minimize command overhead, we'd like to create bios that are 1MB, though 174 + * we allow the user to ask for a smaller size. 175 + */ 176 + static unsigned int 177 + xfs_verify_iosize( 178 + const struct xfs_verify_media *me, 179 + struct xfs_buftarg *btp, 180 + uint64_t bbcount) 181 + { 182 + unsigned int iosize = 183 + min_not_zero(SZ_1M, me->me_max_io_size); 184 + 185 + BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT); 186 + ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev)); 187 + 188 + return clamp(iosize, bdev_logical_block_size(btp->bt_bdev), 189 + BBTOB(bbcount)); 190 + } 191 + 192 + /* Allocate as much memory as we can get for verification buffer. */ 193 + static struct folio * 194 + xfs_verify_alloc_folio( 195 + const unsigned int iosize) 196 + { 197 + unsigned int order = get_order(iosize); 198 + 199 + while (order > 0) { 200 + struct folio *folio = 201 + folio_alloc(GFP_KERNEL | __GFP_NORETRY, order); 202 + 203 + if (folio) 204 + return folio; 205 + order--; 206 + } 207 + 208 + return folio_alloc(GFP_KERNEL, 0); 209 + } 210 + 211 + /* Report any kind of problem verifying media */ 212 + static void 213 + xfs_verify_media_error( 214 + struct xfs_mount *mp, 215 + struct xfs_verify_media *me, 216 + struct xfs_buftarg *btp, 217 + xfs_daddr_t daddr, 218 + unsigned int bio_bbcount, 219 + blk_status_t bio_status) 220 + { 221 + trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr, 222 + bio_bbcount, bio_status); 223 + 224 + /* 225 + * Pass any error, I/O or otherwise, up to the caller if we didn't 226 + * successfully verify any bytes at all. 227 + */ 228 + if (me->me_start_daddr == daddr) 229 + me->me_ioerror = -blk_status_to_errno(bio_status); 230 + 231 + /* 232 + * PI validation failures, medium errors, or general IO errors are 233 + * treated as indicators of data loss. Everything else are (hopefully) 234 + * transient errors and are not reported to healthmon or fsnotify. 235 + */ 236 + switch (bio_status) { 237 + case BLK_STS_PROTECTION: 238 + case BLK_STS_IOERR: 239 + case BLK_STS_MEDIUM: 240 + break; 241 + default: 242 + return; 243 + } 244 + 245 + if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT)) 246 + return; 247 + 248 + xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount); 249 + 250 + if (!xfs_has_rmapbt(mp)) 251 + return; 252 + 253 + switch (me->me_dev) { 254 + case XFS_DEV_DATA: 255 + xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount); 256 + break; 257 + case XFS_DEV_RT: 258 + xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount); 259 + break; 260 + } 261 + } 262 + 263 + /* Verify the media of an xfs device by submitting read requests to the disk. */ 264 + static int 265 + xfs_verify_media( 266 + struct xfs_mount *mp, 267 + struct xfs_verify_media *me) 268 + { 269 + struct xfs_buftarg *btp = NULL; 270 + struct bio *bio; 271 + struct folio *folio; 272 + xfs_daddr_t daddr; 273 + uint64_t bbcount; 274 + int error = 0; 275 + 276 + me->me_ioerror = 0; 277 + 278 + switch (me->me_dev) { 279 + case XFS_DEV_DATA: 280 + btp = mp->m_ddev_targp; 281 + break; 282 + case XFS_DEV_LOG: 283 + if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev) 284 + btp = mp->m_logdev_targp; 285 + break; 286 + case XFS_DEV_RT: 287 + btp = mp->m_rtdev_targp; 288 + break; 289 + } 290 + if (!btp) 291 + return -ENODEV; 292 + 293 + /* 294 + * If the caller told us to verify beyond the end of the disk, tell the 295 + * user exactly where that was. 296 + */ 297 + if (me->me_end_daddr > btp->bt_nr_sectors) 298 + me->me_end_daddr = btp->bt_nr_sectors; 299 + 300 + /* start and end have to be aligned to the lba size */ 301 + if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr), 302 + bdev_logical_block_size(btp->bt_bdev))) 303 + return -EINVAL; 304 + 305 + /* 306 + * end_daddr is the exclusive end of the range, so if start_daddr 307 + * reaches there (or beyond), there's no work to be done. 308 + */ 309 + if (me->me_start_daddr >= me->me_end_daddr) 310 + return 0; 311 + 312 + /* 313 + * There are three ranges involved here: 314 + * 315 + * - [me->me_start_daddr, me->me_end_daddr) is the range that the 316 + * user wants to verify. end_daddr can be beyond the end of the 317 + * disk; we'll constrain it to the end if necessary. 318 + * 319 + * - [daddr, me->me_end_daddr) is the range that we have not yet 320 + * verified. We update daddr after each successful read. 321 + * me->me_start_daddr is set to daddr before returning. 322 + * 323 + * - [daddr, daddr + bio_bbcount) is the range that we're currently 324 + * verifying. 325 + */ 326 + daddr = me->me_start_daddr; 327 + bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) - 328 + me->me_start_daddr; 329 + 330 + folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount)); 331 + if (!folio) 332 + return -ENOMEM; 333 + 334 + trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount, 335 + folio); 336 + 337 + bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL); 338 + if (!bio) { 339 + error = -ENOMEM; 340 + goto out_folio; 341 + } 342 + 343 + while (bbcount > 0) { 344 + unsigned int bio_bbcount; 345 + blk_status_t bio_status; 346 + 347 + bio_reset(bio, btp->bt_bdev, REQ_OP_READ); 348 + bio->bi_iter.bi_sector = daddr; 349 + bio_add_folio_nofail(bio, folio, 350 + min(bbcount << SECTOR_SHIFT, folio_size(folio)), 351 + 0); 352 + 353 + /* 354 + * Save the length of the bio before we submit it, because we 355 + * need the original daddr and length for reporting IO errors 356 + * if the bio fails. 357 + */ 358 + bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT; 359 + submit_bio_wait(bio); 360 + bio_status = bio->bi_status; 361 + if (bio_status != BLK_STS_OK) { 362 + xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount, 363 + bio_status); 364 + error = 0; 365 + break; 366 + } 367 + 368 + daddr += bio_bbcount; 369 + bbcount -= bio_bbcount; 370 + 371 + if (bbcount == 0) 372 + break; 373 + 374 + if (me->me_rest_us) { 375 + ktime_t expires; 376 + 377 + expires = ktime_add_ns(ktime_get(), 378 + me->me_rest_us * 1000); 379 + set_current_state(TASK_KILLABLE); 380 + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); 381 + } 382 + 383 + if (fatal_signal_pending(current)) { 384 + error = -EINTR; 385 + break; 386 + } 387 + 388 + cond_resched(); 389 + } 390 + 391 + bio_put(bio); 392 + out_folio: 393 + folio_put(folio); 394 + 395 + if (error) 396 + return error; 397 + 398 + /* 399 + * Advance start_daddr to the end of what we verified if there wasn't 400 + * an operational error. 401 + */ 402 + me->me_start_daddr = daddr; 403 + trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev); 404 + return 0; 405 + } 406 + 407 + int 408 + xfs_ioc_verify_media( 409 + struct file *file, 410 + struct xfs_verify_media __user *arg) 411 + { 412 + struct xfs_verify_media me; 413 + struct xfs_inode *ip = XFS_I(file_inode(file)); 414 + struct xfs_mount *mp = ip->i_mount; 415 + int error; 416 + 417 + if (!capable(CAP_SYS_ADMIN)) 418 + return -EPERM; 419 + 420 + if (copy_from_user(&me, arg, sizeof(me))) 421 + return -EFAULT; 422 + 423 + if (me.me_pad) 424 + return -EINVAL; 425 + if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS) 426 + return -EINVAL; 427 + 428 + switch (me.me_dev) { 429 + case XFS_DEV_DATA: 430 + case XFS_DEV_LOG: 431 + case XFS_DEV_RT: 432 + break; 433 + default: 434 + return -EINVAL; 435 + } 436 + 437 + error = xfs_verify_media(mp, &me); 438 + if (error) 439 + return error; 440 + 441 + if (copy_to_user(arg, &me, sizeof(me))) 442 + return -EFAULT; 443 + 444 + return 0; 445 + }

+13

fs/xfs/xfs_verify_media.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (c) 2026 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef __XFS_VERIFY_MEDIA_H__ 7 + #define __XFS_VERIFY_MEDIA_H__ 8 + 9 + struct xfs_verify_media; 10 + int xfs_ioc_verify_media(struct file *file, 11 + struct xfs_verify_media __user *arg); 12 + 13 + #endif /* __XFS_VERIFY_MEDIA_H__ */

+7

include/linux/fs/super_types.h

··· 35 35 struct workqueue_struct; 36 36 struct writeback_control; 37 37 struct xattr_handler; 38 + struct fserror_event; 38 39 39 40 extern struct super_block *blockdev_superblock; 40 41 ··· 125 124 */ 126 125 int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); 127 126 void (*shutdown)(struct super_block *sb); 127 + 128 + /* Report a filesystem error */ 129 + void (*report_error)(const struct fserror_event *event); 128 130 }; 129 131 130 132 struct super_block { ··· 272 268 spinlock_t s_inode_wblist_lock; 273 269 struct list_head s_inodes_wb; /* writeback inodes */ 274 270 long s_min_writeback_pages; 271 + 272 + /* number of fserrors that are being sent to fsnotify/filesystems */ 273 + refcount_t s_pending_errors; 275 274 } __randomize_layout; 276 275 277 276 /*

+75

include/linux/fserror.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Copyright (c) 2025 Oracle. All Rights Reserved. 4 + * Author: Darrick J. Wong <djwong@kernel.org> 5 + */ 6 + #ifndef _LINUX_FSERROR_H__ 7 + #define _LINUX_FSERROR_H__ 8 + 9 + void fserror_mount(struct super_block *sb); 10 + void fserror_unmount(struct super_block *sb); 11 + 12 + enum fserror_type { 13 + /* pagecache I/O failed */ 14 + FSERR_BUFFERED_READ, 15 + FSERR_BUFFERED_WRITE, 16 + 17 + /* direct I/O failed */ 18 + FSERR_DIRECTIO_READ, 19 + FSERR_DIRECTIO_WRITE, 20 + 21 + /* out of band media error reported */ 22 + FSERR_DATA_LOST, 23 + 24 + /* filesystem metadata */ 25 + FSERR_METADATA, 26 + }; 27 + 28 + struct fserror_event { 29 + struct work_struct work; 30 + struct super_block *sb; 31 + struct inode *inode; 32 + loff_t pos; 33 + u64 len; 34 + enum fserror_type type; 35 + 36 + /* negative error number */ 37 + int error; 38 + }; 39 + 40 + void fserror_report(struct super_block *sb, struct inode *inode, 41 + enum fserror_type type, loff_t pos, u64 len, int error, 42 + gfp_t gfp); 43 + 44 + static inline void fserror_report_io(struct inode *inode, 45 + enum fserror_type type, loff_t pos, 46 + u64 len, int error, gfp_t gfp) 47 + { 48 + fserror_report(inode->i_sb, inode, type, pos, len, error, gfp); 49 + } 50 + 51 + static inline void fserror_report_data_lost(struct inode *inode, loff_t pos, 52 + u64 len, gfp_t gfp) 53 + { 54 + fserror_report(inode->i_sb, inode, FSERR_DATA_LOST, pos, len, -EIO, 55 + gfp); 56 + } 57 + 58 + static inline void fserror_report_file_metadata(struct inode *inode, int error, 59 + gfp_t gfp) 60 + { 61 + fserror_report(inode->i_sb, inode, FSERR_METADATA, 0, 0, error, gfp); 62 + } 63 + 64 + static inline void fserror_report_metadata(struct super_block *sb, int error, 65 + gfp_t gfp) 66 + { 67 + fserror_report(sb, NULL, FSERR_METADATA, 0, 0, error, gfp); 68 + } 69 + 70 + static inline void fserror_report_shutdown(struct super_block *sb, gfp_t gfp) 71 + { 72 + fserror_report(sb, NULL, FSERR_METADATA, 0, 0, -ESHUTDOWN, gfp); 73 + } 74 + 75 + #endif /* _LINUX_FSERROR_H__ */

-3

include/linux/jbd2.h

··· 1815 1815 1816 1816 #endif /* __KERNEL__ */ 1817 1817 1818 - #define EFSBADCRC EBADMSG /* Bad CRC detected */ 1819 - #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 1820 - 1821 1818 #endif /* _LINUX_JBD2_H */

+2

include/uapi/asm-generic/errno.h

··· 55 55 #define EMULTIHOP 72 /* Multihop attempted */ 56 56 #define EDOTDOT 73 /* RFS specific error */ 57 57 #define EBADMSG 74 /* Not a data message */ 58 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 58 59 #define EOVERFLOW 75 /* Value too large for defined data type */ 59 60 #define ENOTUNIQ 76 /* Name not unique on network */ 60 61 #define EBADFD 77 /* File descriptor in bad state */ ··· 99 98 #define EINPROGRESS 115 /* Operation now in progress */ 100 99 #define ESTALE 116 /* Stale file handle */ 101 100 #define EUCLEAN 117 /* Structure needs cleaning */ 101 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 102 102 #define ENOTNAM 118 /* Not a XENIX named type file */ 103 103 #define ENAVAIL 119 /* No XENIX semaphores available */ 104 104 #define EISNAM 120 /* Is a named type file */

+2

tools/arch/alpha/include/uapi/asm/errno.h

··· 55 55 #define ENOSR 82 /* Out of streams resources */ 56 56 #define ETIME 83 /* Timer expired */ 57 57 #define EBADMSG 84 /* Not a data message */ 58 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 58 59 #define EPROTO 85 /* Protocol error */ 59 60 #define ENODATA 86 /* No data available */ 60 61 #define ENOSTR 87 /* Device not a stream */ ··· 97 96 #define EREMCHG 115 /* Remote address changed */ 98 97 99 98 #define EUCLEAN 117 /* Structure needs cleaning */ 99 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 100 100 #define ENOTNAM 118 /* Not a XENIX named type file */ 101 101 #define ENAVAIL 119 /* No XENIX semaphores available */ 102 102 #define EISNAM 120 /* Is a named type file */

+2

tools/arch/mips/include/uapi/asm/errno.h

··· 50 50 #define EDOTDOT 73 /* RFS specific error */ 51 51 #define EMULTIHOP 74 /* Multihop attempted */ 52 52 #define EBADMSG 77 /* Not a data message */ 53 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 53 54 #define ENAMETOOLONG 78 /* File name too long */ 54 55 #define EOVERFLOW 79 /* Value too large for defined data type */ 55 56 #define ENOTUNIQ 80 /* Name not unique on network */ ··· 89 88 #define EISCONN 133 /* Transport endpoint is already connected */ 90 89 #define ENOTCONN 134 /* Transport endpoint is not connected */ 91 90 #define EUCLEAN 135 /* Structure needs cleaning */ 91 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 92 92 #define ENOTNAM 137 /* Not a XENIX named type file */ 93 93 #define ENAVAIL 138 /* No XENIX semaphores available */ 94 94 #define EISNAM 139 /* Is a named type file */

+2

tools/arch/parisc/include/uapi/asm/errno.h

··· 36 36 37 37 #define EDOTDOT 66 /* RFS specific error */ 38 38 #define EBADMSG 67 /* Not a data message */ 39 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 39 40 #define EUSERS 68 /* Too many users */ 40 41 #define EDQUOT 69 /* Quota exceeded */ 41 42 #define ESTALE 70 /* Stale file handle */ ··· 63 62 #define ERESTART 175 /* Interrupted system call should be restarted */ 64 63 #define ESTRPIPE 176 /* Streams pipe error */ 65 64 #define EUCLEAN 177 /* Structure needs cleaning */ 65 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 66 66 #define ENOTNAM 178 /* Not a XENIX named type file */ 67 67 #define ENAVAIL 179 /* No XENIX semaphores available */ 68 68 #define EISNAM 180 /* Is a named type file */

+2

tools/arch/sparc/include/uapi/asm/errno.h

··· 48 48 #define ENOSR 74 /* Out of streams resources */ 49 49 #define ENOMSG 75 /* No message of desired type */ 50 50 #define EBADMSG 76 /* Not a data message */ 51 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 51 52 #define EIDRM 77 /* Identifier removed */ 52 53 #define EDEADLK 78 /* Resource deadlock would occur */ 53 54 #define ENOLCK 79 /* No record locks available */ ··· 92 91 #define ENOTUNIQ 115 /* Name not unique on network */ 93 92 #define ERESTART 116 /* Interrupted syscall should be restarted */ 94 93 #define EUCLEAN 117 /* Structure needs cleaning */ 94 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 95 95 #define ENOTNAM 118 /* Not a XENIX named type file */ 96 96 #define ENAVAIL 119 /* No XENIX semaphores available */ 97 97 #define EISNAM 120 /* Is a named type file */

+2

tools/include/uapi/asm-generic/errno.h

··· 55 55 #define EMULTIHOP 72 /* Multihop attempted */ 56 56 #define EDOTDOT 73 /* RFS specific error */ 57 57 #define EBADMSG 74 /* Not a data message */ 58 + #define EFSBADCRC EBADMSG /* Bad CRC detected */ 58 59 #define EOVERFLOW 75 /* Value too large for defined data type */ 59 60 #define ENOTUNIQ 76 /* Name not unique on network */ 60 61 #define EBADFD 77 /* File descriptor in bad state */ ··· 99 98 #define EINPROGRESS 115 /* Operation now in progress */ 100 99 #define ESTALE 116 /* Stale file handle */ 101 100 #define EUCLEAN 117 /* Structure needs cleaning */ 101 + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 102 102 #define ENOTNAM 118 /* Not a XENIX named type file */ 103 103 #define ENAVAIL 119 /* No XENIX semaphores available */ 104 104 #define EISNAM 120 /* Is a named type file */

Configure Feed

Configure Feed