Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs

Pull fsnotify pre-content notification support from Jan Kara:
"This introduces a new fsnotify event (FS_PRE_ACCESS) that gets
generated before a file's contents are accessed.

The event is synchronous, so if there is a listener for this event, the
kernel waits for reply. On success the execution continues as usual,
on failure we propagate the error to userspace. This allows userspace
to fill in file content on demand from slow storage. The context in
which the events are generated has been picked so that we don't hold
any locks and thus there's no risk of a deadlock for the userspace
handler.

The new pre-content event is available only for users with global
CAP_SYS_ADMIN capability (similarly to other parts of fanotify
functionality) and it is the administrator's responsibility to make sure
the userspace event handler doesn't do stupid stuff that can DoS the
system.

Based on your feedback from the last submission, fsnotify code has
been improved and now file->f_mode encodes whether a pre-content event
needs to be generated for the file, so the fast path when nobody wants
pre-content events for the file just grows one additional file->f_mode
check. As a bonus, this also removes the checks for whether the old
FS_ACCESS event needs to be generated from the fast path. Also the
place where the event is generated during page fault has been moved so
now filemap_fault() generates the event if and only if there is no
uptodate folio in the page cache.

Also, we have dropped the FS_PRE_MODIFY event, as current real-world users
of the pre-content functionality don't really use it, so let's start
with the minimal useful feature set"

* tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs: (21 commits)
fanotify: Fix crash in fanotify_init(2)
fs: don't block write during exec on pre-content watched files
fs: enable pre-content events on supported file systems
ext4: add pre-content fsnotify hook for DAX faults
btrfs: disable defrag on pre-content watched files
xfs: add pre-content fsnotify hook for DAX faults
fsnotify: generate pre-content permission event on page fault
mm: don't allow huge faults for files with pre content watches
fanotify: disable readahead if we have pre-content watches
fanotify: allow to set errno in FAN_DENY permission response
fanotify: report file range info with pre-content events
fanotify: introduce FAN_PRE_ACCESS permission event
fsnotify: generate pre-content permission event on truncate
fsnotify: pass optional file access range in pre-content event
fsnotify: introduce pre-content permission events
fanotify: reserve event bit of deprecated FAN_DIR_MODIFY
fanotify: rename a misnamed constant
fanotify: don't skip extra event info if no info_mode is set
fsnotify: check if file is actually being watched for pre-content events on open
fsnotify: opt-in for permission events at file open time
...

+669 -106
+2 -2
fs/binfmt_elf.c
··· 1257 1257 } 1258 1258 reloc_func_desc = interp_load_addr; 1259 1259 1260 - allow_write_access(interpreter); 1260 + exe_file_allow_write_access(interpreter); 1261 1261 fput(interpreter); 1262 1262 1263 1263 kfree(interp_elf_ex); ··· 1354 1354 kfree(interp_elf_ex); 1355 1355 kfree(interp_elf_phdata); 1356 1356 out_free_file: 1357 - allow_write_access(interpreter); 1357 + exe_file_allow_write_access(interpreter); 1358 1358 if (interpreter) 1359 1359 fput(interpreter); 1360 1360 out_free_ph:
+2 -2
fs/binfmt_elf_fdpic.c
··· 394 394 goto error; 395 395 } 396 396 397 - allow_write_access(interpreter); 397 + exe_file_allow_write_access(interpreter); 398 398 fput(interpreter); 399 399 interpreter = NULL; 400 400 } ··· 467 467 468 468 error: 469 469 if (interpreter) { 470 - allow_write_access(interpreter); 470 + exe_file_allow_write_access(interpreter); 471 471 fput(interpreter); 472 472 } 473 473 kfree(interpreter_name);
+9
fs/btrfs/ioctl.c
··· 2544 2544 goto out; 2545 2545 } 2546 2546 2547 + /* 2548 + * Don't allow defrag on pre-content watched files, as it could 2549 + * populate the page cache with 0's via readahead. 2550 + */ 2551 + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { 2552 + ret = -EINVAL; 2553 + goto out; 2554 + } 2555 + 2547 2556 if (argp) { 2548 2557 if (copy_from_user(&range, argp, sizeof(range))) { 2549 2558 ret = -EFAULT;
+1 -1
fs/btrfs/super.c
··· 961 961 #endif 962 962 sb->s_xattr = btrfs_xattr_handlers; 963 963 sb->s_time_gran = 1; 964 - sb->s_iflags |= SB_I_CGROUPWB; 964 + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; 965 965 966 966 err = super_setup_bdi(sb); 967 967 if (err) {
+4 -4
fs/exec.c
··· 913 913 path_noexec(&file->f_path)) 914 914 return ERR_PTR(-EACCES); 915 915 916 - err = deny_write_access(file); 916 + err = exe_file_deny_write_access(file); 917 917 if (err) 918 918 return ERR_PTR(err); 919 919 ··· 928 928 * Returns ERR_PTR on failure or allocated struct file on success. 929 929 * 930 930 * As this is a wrapper for the internal do_open_execat(), callers 931 - * must call allow_write_access() before fput() on release. Also see 931 + * must call exe_file_allow_write_access() before fput() on release. Also see 932 932 * do_close_execat(). 933 933 */ 934 934 struct file *open_exec(const char *name) ··· 1493 1493 { 1494 1494 if (!file) 1495 1495 return; 1496 - allow_write_access(file); 1496 + exe_file_allow_write_access(file); 1497 1497 fput(file); 1498 1498 } 1499 1499 ··· 1822 1822 bprm->file = bprm->interpreter; 1823 1823 bprm->interpreter = NULL; 1824 1824 1825 - allow_write_access(exec); 1825 + exe_file_allow_write_access(exec); 1826 1826 if (unlikely(bprm->have_execfd)) { 1827 1827 if (bprm->executable) { 1828 1828 fput(exec);
+3
fs/ext4/file.c
··· 756 756 return VM_FAULT_SIGBUS; 757 757 } 758 758 } else { 759 + result = filemap_fsnotify_fault(vmf); 760 + if (unlikely(result)) 761 + return result; 759 762 filemap_invalidate_lock_shared(mapping); 760 763 } 761 764 result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
+3
fs/ext4/super.c
··· 5301 5301 /* i_version is always enabled now */ 5302 5302 sb->s_flags |= SB_I_VERSION; 5303 5303 5304 + /* HSM events are allowed by default. */ 5305 + sb->s_iflags |= SB_I_ALLOW_HSM; 5306 + 5304 5307 err = ext4_check_feature_compatibility(sb, es, silent); 5305 5308 if (err) 5306 5309 goto failed_mount;
+2 -2
fs/fcntl.c
··· 1158 1158 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 1159 1159 * is defined as O_NONBLOCK on some platforms and not on others. 1160 1160 */ 1161 - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != 1161 + BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != 1162 1162 HWEIGHT32( 1163 1163 (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | 1164 - __FMODE_EXEC | __FMODE_NONOTIFY)); 1164 + __FMODE_EXEC)); 1165 1165 1166 1166 fasync_cache = kmem_cache_create("fasync_cache", 1167 1167 sizeof(struct fasync_struct), 0,
+24 -7
fs/notify/fanotify/fanotify.c
··· 223 223 struct fanotify_perm_event *event, 224 224 struct fsnotify_iter_info *iter_info) 225 225 { 226 - int ret; 226 + int ret, errno; 227 227 228 228 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 229 229 ··· 262 262 ret = 0; 263 263 break; 264 264 case FAN_DENY: 265 + /* Check custom errno from pre-content events */ 266 + errno = fanotify_get_response_errno(event->response); 267 + if (errno) { 268 + ret = -errno; 269 + break; 270 + } 271 + fallthrough; 265 272 default: 266 273 ret = -EPERM; 267 274 } 268 275 269 276 /* Check if the response should be audited */ 270 - if (event->response & FAN_AUDIT) 271 - audit_fanotify(event->response & ~FAN_AUDIT, 272 - &event->audit_rule); 277 + if (event->response & FAN_AUDIT) { 278 + u32 response = event->response & 279 + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS); 280 + audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule); 281 + } 273 282 274 283 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 275 284 group, event, ret); ··· 557 548 return &pevent->fae; 558 549 } 559 550 560 - static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, 551 + static struct fanotify_event *fanotify_alloc_perm_event(const void *data, 552 + int data_type, 561 553 gfp_t gfp) 562 554 { 555 + const struct path *path = fsnotify_data_path(data, data_type); 556 + const struct file_range *range = 557 + fsnotify_data_file_range(data, data_type); 563 558 struct fanotify_perm_event *pevent; 564 559 565 560 pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); ··· 577 564 pevent->hdr.len = 0; 578 565 pevent->state = FAN_EVENT_INIT; 579 566 pevent->path = *path; 567 + /* NULL ppos means no range info */ 568 + pevent->ppos = range ? &range->pos : NULL; 569 + pevent->count = range ? 
range->count : 0; 580 570 path_get(path); 581 571 582 572 return &pevent->fae; ··· 817 801 old_memcg = set_active_memcg(group->memcg); 818 802 819 803 if (fanotify_is_perm_event(mask)) { 820 - event = fanotify_alloc_perm_event(path, gfp); 804 + event = fanotify_alloc_perm_event(data, data_type, gfp); 821 805 } else if (fanotify_is_error_event(mask)) { 822 806 event = fanotify_alloc_error_event(group, fsid, data, 823 807 data_type, &hash); ··· 925 909 BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); 926 910 BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); 927 911 BUILD_BUG_ON(FAN_RENAME != FS_RENAME); 912 + BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); 928 913 929 - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21); 914 + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); 930 915 931 916 mask = fanotify_group_event_mask(group, iter_info, &match_mask, 932 917 mask, data, data_type, dir);
+15
fs/notify/fanotify/fanotify.h
··· 425 425 struct fanotify_perm_event { 426 426 struct fanotify_event fae; 427 427 struct path path; 428 + const loff_t *ppos; /* optional file range info */ 429 + size_t count; 428 430 u32 response; /* userspace answer to the event */ 429 431 unsigned short state; /* state of the event */ 430 432 int fd; /* fd we passed to userspace for this event */ ··· 446 444 { 447 445 return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) && 448 446 mask & FANOTIFY_PERM_EVENTS; 447 + } 448 + 449 + static inline bool fanotify_event_has_access_range(struct fanotify_event *event) 450 + { 451 + if (!(event->mask & FANOTIFY_PRE_CONTENT_EVENTS)) 452 + return false; 453 + 454 + return FANOTIFY_PERM(event)->ppos; 449 455 } 450 456 451 457 static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) ··· 527 517 mflags |= FAN_MARK_IGNORE; 528 518 529 519 return mflags; 520 + } 521 + 522 + static inline u32 fanotify_get_response_errno(int res) 523 + { 524 + return (res >> FAN_ERRNO_SHIFT) & FAN_ERRNO_MASK; 530 525 }
+120 -30
fs/notify/fanotify/fanotify_user.c
··· 100 100 * 101 101 * Internal and external open flags are stored together in field f_flags of 102 102 * struct file. Only external open flags shall be allowed in event_f_flags. 103 - * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be 104 - * excluded. 103 + * Internal flags like FMODE_EXEC shall be excluded. 105 104 */ 106 105 #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ 107 106 O_ACCMODE | O_APPEND | O_NONBLOCK | \ ··· 117 118 #define FANOTIFY_EVENT_ALIGN 4 118 119 #define FANOTIFY_FID_INFO_HDR_LEN \ 119 120 (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) 120 - #define FANOTIFY_PIDFD_INFO_HDR_LEN \ 121 + #define FANOTIFY_PIDFD_INFO_LEN \ 121 122 sizeof(struct fanotify_event_info_pidfd) 122 123 #define FANOTIFY_ERROR_INFO_LEN \ 123 124 (sizeof(struct fanotify_event_info_error)) 125 + #define FANOTIFY_RANGE_INFO_LEN \ 126 + (sizeof(struct fanotify_event_info_range)) 124 127 125 128 static int fanotify_fid_info_len(int fh_len, int name_len) 126 129 { ··· 160 159 int fh_len; 161 160 int dot_len = 0; 162 161 163 - if (!info_mode) 164 - return event_len; 165 - 166 162 if (fanotify_is_error_event(event->mask)) 167 163 event_len += FANOTIFY_ERROR_INFO_LEN; 168 164 ··· 174 176 dot_len = 1; 175 177 } 176 178 177 - if (info_mode & FAN_REPORT_PIDFD) 178 - event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; 179 - 180 179 if (fanotify_event_has_object_fh(event)) { 181 180 fh_len = fanotify_event_object_fh_len(event); 182 181 event_len += fanotify_fid_info_len(fh_len, dot_len); 183 182 } 183 + 184 + if (info_mode & FAN_REPORT_PIDFD) 185 + event_len += FANOTIFY_PIDFD_INFO_LEN; 186 + 187 + if (fanotify_event_has_access_range(event)) 188 + event_len += FANOTIFY_RANGE_INFO_LEN; 184 189 185 190 return event_len; 186 191 } ··· 259 258 return client_fd; 260 259 261 260 /* 262 - * we need a new file handle for the userspace program so it can read even if it was 263 - * originally opened O_WRONLY. 
261 + * We provide an fd for the userspace program, so it could access the 262 + * file without generating fanotify events itself. 264 263 */ 265 - new_file = dentry_open(path, 266 - group->fanotify_data.f_flags | __FMODE_NONOTIFY, 267 - current_cred()); 264 + new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags, 265 + current_cred()); 268 266 if (IS_ERR(new_file)) { 269 267 put_unused_fd(client_fd); 270 268 client_fd = PTR_ERR(new_file); ··· 327 327 struct fanotify_perm_event *event; 328 328 int fd = response_struct->fd; 329 329 u32 response = response_struct->response; 330 + int errno = fanotify_get_response_errno(response); 330 331 int ret = info_len; 331 332 struct fanotify_response_info_audit_rule friar; 332 333 333 - pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, 334 - group, fd, response, info, info_len); 334 + pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n", 335 + __func__, group, fd, response, errno, info, info_len); 335 336 /* 336 337 * make sure the response is valid, if invalid we do nothing and either 337 338 * userspace can send a valid response or we will clean it up after the ··· 343 342 344 343 switch (response & FANOTIFY_RESPONSE_ACCESS) { 345 344 case FAN_ALLOW: 345 + if (errno) 346 + return -EINVAL; 347 + break; 346 348 case FAN_DENY: 349 + /* Custom errno is supported only for pre-content groups */ 350 + if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) 351 + return -EINVAL; 352 + 353 + /* 354 + * Limit errno to values expected on open(2)/read(2)/write(2) 355 + * of regular files. 
356 + */ 357 + switch (errno) { 358 + case 0: 359 + case EIO: 360 + case EPERM: 361 + case EBUSY: 362 + case ETXTBSY: 363 + case EAGAIN: 364 + case ENOSPC: 365 + case EDQUOT: 366 + break; 367 + default: 368 + return -EINVAL; 369 + } 347 370 break; 348 371 default: 349 372 return -EINVAL; ··· 531 506 size_t count) 532 507 { 533 508 struct fanotify_event_info_pidfd info = { }; 534 - size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; 509 + size_t info_len = FANOTIFY_PIDFD_INFO_LEN; 535 510 536 511 if (WARN_ON_ONCE(info_len > count)) 537 512 return -EFAULT; ··· 539 514 info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; 540 515 info.hdr.len = info_len; 541 516 info.pidfd = pidfd; 517 + 518 + if (copy_to_user(buf, &info, info_len)) 519 + return -EFAULT; 520 + 521 + return info_len; 522 + } 523 + 524 + static size_t copy_range_info_to_user(struct fanotify_event *event, 525 + char __user *buf, int count) 526 + { 527 + struct fanotify_perm_event *pevent = FANOTIFY_PERM(event); 528 + struct fanotify_event_info_range info = { }; 529 + size_t info_len = FANOTIFY_RANGE_INFO_LEN; 530 + 531 + if (WARN_ON_ONCE(info_len > count)) 532 + return -EFAULT; 533 + 534 + if (WARN_ON_ONCE(!pevent->ppos)) 535 + return -EINVAL; 536 + 537 + info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE; 538 + info.hdr.len = info_len; 539 + info.offset = *(pevent->ppos); 540 + info.count = pevent->count; 542 541 543 542 if (copy_to_user(buf, &info, info_len)) 544 543 return -EFAULT; ··· 691 642 total_bytes += ret; 692 643 } 693 644 645 + if (fanotify_event_has_access_range(event)) { 646 + ret = copy_range_info_to_user(event, buf, count); 647 + if (ret < 0) 648 + return ret; 649 + buf += ret; 650 + count -= ret; 651 + total_bytes += ret; 652 + } 653 + 694 654 return total_bytes; 695 655 } 696 656 ··· 814 756 buf += FAN_EVENT_METADATA_LEN; 815 757 count -= FAN_EVENT_METADATA_LEN; 816 758 817 - if (info_mode) { 818 - ret = copy_info_records_to_user(event, info, info_mode, pidfd, 819 - buf, count); 820 - if (ret < 0) 821 
- goto out_close_fd; 822 - } 759 + ret = copy_info_records_to_user(event, info, info_mode, pidfd, 760 + buf, count); 761 + if (ret < 0) 762 + goto out_close_fd; 823 763 824 764 if (f) 825 765 fd_install(fd, f); ··· 1350 1294 } 1351 1295 1352 1296 static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, 1353 - unsigned int fan_flags) 1297 + __u32 mask, unsigned int fan_flags) 1354 1298 { 1355 1299 /* 1356 1300 * Non evictable mark cannot be downgraded to evictable mark. ··· 1375 1319 if (fan_flags & FAN_MARK_IGNORE && 1376 1320 !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && 1377 1321 fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) 1322 + return -EEXIST; 1323 + 1324 + /* For now pre-content events are not generated for directories */ 1325 + mask |= fsn_mark->mask; 1326 + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) 1378 1327 return -EEXIST; 1379 1328 1380 1329 return 0; ··· 1408 1347 /* 1409 1348 * Check if requested mark flags conflict with an existing mark flags. 
1410 1349 */ 1411 - ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); 1350 + ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); 1412 1351 if (ret) 1413 1352 goto out; 1414 1353 ··· 1470 1409 unsigned int fid_mode = flags & FANOTIFY_FID_BITS; 1471 1410 unsigned int class = flags & FANOTIFY_CLASS_BITS; 1472 1411 unsigned int internal_flags = 0; 1412 + struct file *file; 1473 1413 1474 1414 pr_debug("%s: flags=%x event_f_flags=%x\n", 1475 1415 __func__, flags, event_f_flags); ··· 1539 1477 (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) 1540 1478 return -EINVAL; 1541 1479 1542 - f_flags = O_RDWR | __FMODE_NONOTIFY; 1480 + f_flags = O_RDWR; 1543 1481 if (flags & FAN_CLOEXEC) 1544 1482 f_flags |= O_CLOEXEC; 1545 1483 if (flags & FAN_NONBLOCK) ··· 1617 1555 goto out_destroy_group; 1618 1556 } 1619 1557 1620 - fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); 1558 + fd = get_unused_fd_flags(f_flags); 1621 1559 if (fd < 0) 1622 1560 goto out_destroy_group; 1623 1561 1562 + file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group, 1563 + f_flags, FMODE_NONOTIFY); 1564 + if (IS_ERR(file)) { 1565 + put_unused_fd(fd); 1566 + fd = PTR_ERR(file); 1567 + goto out_destroy_group; 1568 + } 1569 + fd_install(fd, file); 1624 1570 return fd; 1625 1571 1626 1572 out_destroy_group: ··· 1708 1638 unsigned int flags) 1709 1639 { 1710 1640 unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; 1641 + bool is_dir = d_is_dir(path->dentry); 1711 1642 /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ 1712 1643 bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || 1713 1644 (mask & FAN_RENAME) || 1714 1645 (flags & FAN_MARK_IGNORE); 1646 + 1647 + /* 1648 + * Filesystems need to opt-into pre-content evnets (a.k.a HSM) 1649 + * and they are only supported on regular files and directories. 
1650 + */ 1651 + if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { 1652 + if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) 1653 + return -EOPNOTSUPP; 1654 + if (!is_dir && !d_is_reg(path->dentry)) 1655 + return -EINVAL; 1656 + } 1715 1657 1716 1658 /* 1717 1659 * Some filesystems such as 'proc' acquire unusual locks when opening ··· 1757 1675 * but because we always allowed it, error only when using new APIs. 1758 1676 */ 1759 1677 if (strict_dir_events && mark_type == FAN_MARK_INODE && 1760 - !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) 1678 + !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) 1761 1679 return -ENOTDIR; 1762 1680 1763 1681 return 0; ··· 1858 1776 return -EPERM; 1859 1777 1860 1778 /* 1861 - * Permission events require minimum priority FAN_CLASS_CONTENT. 1779 + * Permission events are not allowed for FAN_CLASS_NOTIF. 1780 + * Pre-content permission events are not allowed for FAN_CLASS_CONTENT. 1862 1781 */ 1863 1782 if (mask & FANOTIFY_PERM_EVENTS && 1864 - group->priority < FSNOTIFY_PRIO_CONTENT) 1783 + group->priority == FSNOTIFY_PRIO_NORMAL) 1784 + return -EINVAL; 1785 + else if (mask & FANOTIFY_PRE_CONTENT_EVENTS && 1786 + group->priority == FSNOTIFY_PRIO_CONTENT) 1865 1787 return -EINVAL; 1866 1788 1867 1789 if (mask & FAN_FS_ERROR && ··· 1898 1812 * useful and was not implemented. 1899 1813 */ 1900 1814 if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) 1815 + return -EINVAL; 1816 + 1817 + /* Pre-content events are not currently generated for directories. */ 1818 + if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) 1901 1819 return -EINVAL; 1902 1820 1903 1821 if (mark_cmd == FAN_MARK_FLUSH) {
+81 -2
fs/notify/fsnotify.c
··· 193 193 return mask & marks_mask; 194 194 } 195 195 196 - /* Are there any inode/mount/sb objects that are interested in this event? */ 196 + /* Are there any inode/mount/sb objects that watch for these events? */ 197 197 static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, 198 198 __u32 mask) 199 199 { ··· 201 201 READ_ONCE(inode->i_sb->s_fsnotify_mask); 202 202 203 203 return mask & marks_mask & ALL_FSNOTIFY_EVENTS; 204 + } 205 + 206 + /* Report pre-content event with optional range info */ 207 + int fsnotify_pre_content(const struct path *path, const loff_t *ppos, 208 + size_t count) 209 + { 210 + struct file_range range; 211 + 212 + /* Report page aligned range only when pos is known */ 213 + if (!ppos) 214 + return fsnotify_path(path, FS_PRE_ACCESS); 215 + 216 + range.path = path; 217 + range.pos = PAGE_ALIGN_DOWN(*ppos); 218 + range.count = PAGE_ALIGN(*ppos + count) - range.pos; 219 + 220 + return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range, 221 + FSNOTIFY_EVENT_FILE_RANGE); 204 222 } 205 223 206 224 /* ··· 641 623 } 642 624 EXPORT_SYMBOL_GPL(fsnotify); 643 625 626 + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 627 + /* 628 + * At open time we check fsnotify_sb_has_priority_watchers() and set the 629 + * FMODE_NONOTIFY_ mode bits accordignly. 630 + * Later, fsnotify permission hooks do not check if there are permission event 631 + * watches, but that there were permission event watches at open time. 632 + */ 633 + void file_set_fsnotify_mode(struct file *file) 634 + { 635 + struct dentry *dentry = file->f_path.dentry, *parent; 636 + struct super_block *sb = dentry->d_sb; 637 + __u32 mnt_mask, p_mask; 638 + 639 + /* Is it a file opened by fanotify? 
*/ 640 + if (FMODE_FSNOTIFY_NONE(file->f_mode)) 641 + return; 642 + 643 + /* 644 + * Permission events is a super set of pre-content events, so if there 645 + * are no permission event watchers, there are also no pre-content event 646 + * watchers and this is implied from the single FMODE_NONOTIFY_PERM bit. 647 + */ 648 + if (likely(!fsnotify_sb_has_priority_watchers(sb, 649 + FSNOTIFY_PRIO_CONTENT))) { 650 + file->f_mode |= FMODE_NONOTIFY_PERM; 651 + return; 652 + } 653 + 654 + /* 655 + * If there are permission event watchers but no pre-content event 656 + * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that. 657 + */ 658 + if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || 659 + likely(!fsnotify_sb_has_priority_watchers(sb, 660 + FSNOTIFY_PRIO_PRE_CONTENT))) { 661 + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; 662 + return; 663 + } 664 + 665 + /* 666 + * OK, there are some pre-content watchers. Check if anybody is 667 + * watching for pre-content events on *this* file. 668 + */ 669 + mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); 670 + if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, 671 + FSNOTIFY_PRE_CONTENT_EVENTS))) 672 + return; 673 + 674 + /* Is parent watching for pre-content events on this file? */ 675 + if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { 676 + parent = dget_parent(dentry); 677 + p_mask = fsnotify_inode_watches_children(d_inode(parent)); 678 + dput(parent); 679 + if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) 680 + return; 681 + } 682 + /* Nobody watching for pre-content events from this file */ 683 + file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; 684 + } 685 + #endif 686 + 644 687 static __init int fsnotify_init(void) 645 688 { 646 689 int ret; 647 690 648 - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); 691 + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); 649 692 650 693 ret = init_srcu_struct(&fsnotify_mark_srcu); 651 694 if (ret)
+47 -15
fs/open.c
··· 81 81 if (!S_ISREG(inode->i_mode)) 82 82 return -EINVAL; 83 83 84 - error = mnt_want_write(path->mnt); 85 - if (error) 86 - goto out; 87 - 88 84 idmap = mnt_idmap(path->mnt); 89 85 error = inode_permission(idmap, inode, MAY_WRITE); 90 86 if (error) 91 - goto mnt_drop_write_and_out; 87 + return error; 88 + 89 + error = fsnotify_truncate_perm(path, length); 90 + if (error) 91 + return error; 92 + 93 + error = mnt_want_write(path->mnt); 94 + if (error) 95 + return error; 92 96 93 97 error = -EPERM; 94 98 if (IS_APPEND(inode)) ··· 118 114 put_write_access(inode); 119 115 mnt_drop_write_and_out: 120 116 mnt_drop_write(path->mnt); 121 - out: 117 + 122 118 return error; 123 119 } 124 120 EXPORT_SYMBOL_GPL(vfs_truncate); ··· 179 175 /* Check IS_APPEND on real upper inode */ 180 176 if (IS_APPEND(file_inode(file))) 181 177 return -EPERM; 182 - sb_start_write(inode->i_sb); 178 + 183 179 error = security_file_truncate(file); 184 - if (!error) 185 - error = do_truncate(file_mnt_idmap(file), dentry, length, 186 - ATTR_MTIME | ATTR_CTIME, file); 180 + if (error) 181 + return error; 182 + 183 + error = fsnotify_truncate_perm(&file->f_path, length); 184 + if (error) 185 + return error; 186 + 187 + sb_start_write(inode->i_sb); 188 + error = do_truncate(file_mnt_idmap(file), dentry, length, 189 + ATTR_MTIME | ATTR_CTIME, file); 187 190 sb_end_write(inode->i_sb); 188 191 189 192 return error; ··· 905 894 f->f_sb_err = file_sample_sb_err(f); 906 895 907 896 if (unlikely(f->f_flags & O_PATH)) { 908 - f->f_mode = FMODE_PATH | FMODE_OPENED; 897 + f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY; 909 898 f->f_op = &empty_fops; 910 899 return 0; 911 900 } ··· 933 922 if (error) 934 923 goto cleanup_all; 935 924 925 + /* 926 + * Set FMODE_NONOTIFY_* bits according to existing permission watches. 927 + * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't 928 + * change anything. 
929 + */ 930 + file_set_fsnotify_mode(f); 936 931 error = fsnotify_open_perm(f); 937 932 if (error) 938 933 goto cleanup_all; ··· 1115 1098 } 1116 1099 EXPORT_SYMBOL(dentry_open); 1117 1100 1101 + struct file *dentry_open_nonotify(const struct path *path, int flags, 1102 + const struct cred *cred) 1103 + { 1104 + struct file *f = alloc_empty_file(flags, cred); 1105 + if (!IS_ERR(f)) { 1106 + int error; 1107 + 1108 + f->f_mode |= FMODE_NONOTIFY; 1109 + error = vfs_open(path, f); 1110 + if (error) { 1111 + fput(f); 1112 + f = ERR_PTR(error); 1113 + } 1114 + } 1115 + return f; 1116 + } 1117 + 1118 1118 /** 1119 1119 * dentry_create - Create and open a file 1120 1120 * @path: path to create ··· 1229 1195 inline int build_open_flags(const struct open_how *how, struct open_flags *op) 1230 1196 { 1231 1197 u64 flags = how->flags; 1232 - u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; 1198 + u64 strip = O_CLOEXEC; 1233 1199 int lookup_flags = 0; 1234 1200 int acc_mode = ACC_MODE(flags); 1235 1201 ··· 1237 1203 "struct open_flags doesn't yet handle flags > 32 bits"); 1238 1204 1239 1205 /* 1240 - * Strip flags that either shouldn't be set by userspace like 1241 - * FMODE_NONOTIFY or that aren't relevant in determining struct 1242 - * open_flags like O_CLOEXEC. 1206 + * Strip flags that aren't relevant in determining struct open_flags. 1243 1207 */ 1244 1208 flags &= ~strip; 1245 1209
+13
fs/xfs/xfs_file.c
··· 1451 1451 1452 1452 trace_xfs_read_fault(ip, order); 1453 1453 1454 + ret = filemap_fsnotify_fault(vmf); 1455 + if (unlikely(ret)) 1456 + return ret; 1454 1457 xfs_ilock(ip, XFS_MMAPLOCK_SHARED); 1455 1458 ret = xfs_dax_fault_locked(vmf, order, false); 1456 1459 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); ··· 1482 1479 vm_fault_t ret; 1483 1480 1484 1481 trace_xfs_write_fault(ip, order); 1482 + /* 1483 + * Usually we get here from ->page_mkwrite callback but in case of DAX 1484 + * we will get here also for ordinary write fault. Handle HSM 1485 + * notifications for that case. 1486 + */ 1487 + if (IS_DAX(inode)) { 1488 + ret = filemap_fsnotify_fault(vmf); 1489 + if (unlikely(ret)) 1490 + return ret; 1491 + } 1485 1492 1486 1493 sb_start_pagefault(inode->i_sb); 1487 1494 file_update_time(vmf->vma->vm_file);
+1 -1
fs/xfs/xfs_super.c
··· 1756 1756 sb->s_time_max = XFS_LEGACY_TIME_MAX; 1757 1757 } 1758 1758 trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max); 1759 - sb->s_iflags |= SB_I_CGROUPWB; 1759 + sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM; 1760 1760 1761 1761 set_posix_acl_flag(sb); 1762 1762
+13 -5
include/linux/fanotify.h
··· 89 89 #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \ 90 90 FAN_RENAME) 91 91 92 + /* Content events can be used to inspect file content */ 93 + #define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \ 94 + FAN_ACCESS_PERM) 95 + /* Pre-content events can be used to fill file content */ 96 + #define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS) 97 + 98 + /* Events that require a permission response from user */ 99 + #define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \ 100 + FANOTIFY_PRE_CONTENT_EVENTS) 101 + 92 102 /* Events that can be reported with event->fd */ 93 103 #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) 94 104 ··· 113 103 #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ 114 104 FANOTIFY_INODE_EVENTS | \ 115 105 FANOTIFY_ERROR_EVENTS) 116 - 117 - /* Events that require a permission response from user */ 118 - #define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ 119 - FAN_OPEN_EXEC_PERM) 120 106 121 107 /* Extra flags that may be reported with event or control handling of events */ 122 108 #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) ··· 132 126 /* These masks check for invalid bits in permission responses. */ 133 127 #define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) 134 128 #define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) 135 - #define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) 129 + #define FANOTIFY_RESPONSE_VALID_MASK \ 130 + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \ 131 + (FAN_ERRNO_MASK << FAN_ERRNO_SHIFT)) 136 132 137 133 /* Do not use these old uapi constants internally */ 138 134 #undef FAN_ALL_CLASS_BITS
+64 -8
include/linux/fs.h
··· 173 173 174 174 #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) 175 175 176 - /* FMODE_* bit 24 */ 177 - 178 176 /* File is embedded in backing_file object */ 179 - #define FMODE_BACKING ((__force fmode_t)(1 << 25)) 177 + #define FMODE_BACKING ((__force fmode_t)(1 << 24)) 180 178 181 - /* File was opened by fanotify and shouldn't generate fanotify events */ 182 - #define FMODE_NONOTIFY ((__force fmode_t)(1 << 26)) 179 + /* 180 + * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be 181 + * generated (see below) 182 + */ 183 + #define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) 184 + 185 + /* 186 + * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be 187 + * generated (see below) 188 + */ 189 + #define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) 183 190 184 191 /* File is capable of returning -EAGAIN if I/O will block */ 185 192 #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) ··· 196 189 197 190 /* File does not contribute to nr_files count */ 198 191 #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) 192 + 193 + /* 194 + * The two FMODE_NONOTIFY* define which fsnotify events should not be generated 195 + * for a file. These are the possible values of (f->f_mode & 196 + * FMODE_FSNOTIFY_MASK) and their meaning: 197 + * 198 + * FMODE_NONOTIFY - suppress all (incl. non-permission) events. 199 + * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. 200 + * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. 
201 + */ 202 + #define FMODE_FSNOTIFY_MASK \ 203 + (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) 204 + 205 + #define FMODE_FSNOTIFY_NONE(mode) \ 206 + ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) 207 + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 208 + #define FMODE_FSNOTIFY_PERM(mode) \ 209 + ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ 210 + (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) 211 + #define FMODE_FSNOTIFY_HSM(mode) \ 212 + ((mode & FMODE_FSNOTIFY_MASK) == 0) 213 + #else 214 + #define FMODE_FSNOTIFY_PERM(mode) 0 215 + #define FMODE_FSNOTIFY_HSM(mode) 0 216 + #endif 217 + 199 218 200 219 /* 201 220 * Attribute flags. These should be or-ed together to figure out what ··· 1279 1246 #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ 1280 1247 #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ 1281 1248 #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ 1249 + #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ 1282 1250 1283 1251 /* Possible states of 'frozen' field */ 1284 1252 enum { ··· 2801 2767 } 2802 2768 struct file *dentry_open(const struct path *path, int flags, 2803 2769 const struct cred *creds); 2770 + struct file *dentry_open_nonotify(const struct path *path, int flags, 2771 + const struct cred *cred); 2804 2772 struct file *dentry_create(const struct path *path, int flags, umode_t mode, 2805 2773 const struct cred *cred); 2806 2774 struct path *backing_file_user_path(struct file *f); ··· 3111 3075 if (file) 3112 3076 atomic_inc(&file_inode(file)->i_writecount); 3113 3077 } 3078 + 3079 + /* 3080 + * Do not prevent write to executable file when watched by pre-content events. 
3081 + * 3082 + * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at 3083 + * the time of file open and remains constant for entire lifetime of the file, 3084 + * so if pre-content watches are added post execution or removed before the end 3085 + * of the execution, it will not cause i_writecount reference leak. 3086 + */ 3087 + static inline int exe_file_deny_write_access(struct file *exe_file) 3088 + { 3089 + if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) 3090 + return 0; 3091 + return deny_write_access(exe_file); 3092 + } 3093 + static inline void exe_file_allow_write_access(struct file *exe_file) 3094 + { 3095 + if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) 3096 + return; 3097 + allow_write_access(exe_file); 3098 + } 3099 + 3114 3100 static inline bool inode_is_open_for_write(const struct inode *inode) 3115 3101 { 3116 3102 return atomic_read(&inode->i_writecount) > 0; ··· 3788 3730 int __init list_bdev_fs_names(char *buf, size_t size); 3789 3731 3790 3732 #define __FMODE_EXEC ((__force int) FMODE_EXEC) 3791 - #define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) 3792 3733 3793 3734 #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) 3794 - #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ 3795 - (flag & __FMODE_NONOTIFY))) 3735 + #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) 3796 3736 3797 3737 static inline bool is_sxid(umode_t mode) 3798 3738 {
+61 -17
include/linux/fsnotify.h
··· 108 108 fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); 109 109 } 110 110 111 + static inline int fsnotify_path(const struct path *path, __u32 mask) 112 + { 113 + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); 114 + } 115 + 111 116 static inline int fsnotify_file(struct file *file, __u32 mask) 112 117 { 113 - const struct path *path; 114 - 115 118 /* 116 119 * FMODE_NONOTIFY are fds generated by fanotify itself which should not 117 120 * generate new events. We also don't want to generate events for 118 121 * FMODE_PATH fds (involves open & close events) as they are just 119 122 * handle creation / destruction events and not "real" file events. 120 123 */ 121 - if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) 124 + if (FMODE_FSNOTIFY_NONE(file->f_mode)) 122 125 return 0; 123 126 124 - path = &file->f_path; 125 - /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */ 126 - if (mask & ALL_FSNOTIFY_PERM_EVENTS && 127 - !fsnotify_sb_has_priority_watchers(path->dentry->d_sb, 128 - FSNOTIFY_PRIO_CONTENT)) 129 - return 0; 130 - 131 - return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); 127 + return fsnotify_path(&file->f_path, mask); 132 128 } 133 129 134 130 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 131 + 132 + void file_set_fsnotify_mode(struct file *file); 133 + 135 134 /* 136 135 * fsnotify_file_area_perm - permission hook before access to file range 137 136 */ 138 137 static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, 139 138 const loff_t *ppos, size_t count) 140 139 { 141 - __u32 fsnotify_mask = FS_ACCESS_PERM; 142 - 143 140 /* 144 141 * filesystem may be modified in the context of permission events 145 142 * (e.g. 
by HSM filling a file on access), so sb freeze protection ··· 144 147 */ 145 148 lockdep_assert_once(file_write_not_started(file)); 146 149 150 + if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) 151 + return 0; 152 + 153 + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) 154 + return 0; 155 + 156 + /* 157 + * read()/write() and other types of access generate pre-content events. 158 + */ 159 + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { 160 + int ret = fsnotify_pre_content(&file->f_path, ppos, count); 161 + 162 + if (ret) 163 + return ret; 164 + } 165 + 147 166 if (!(perm_mask & MAY_READ)) 148 167 return 0; 149 168 150 - return fsnotify_file(file, fsnotify_mask); 169 + /* 170 + * read() also generates the legacy FS_ACCESS_PERM event, so content 171 + * scanners can inspect the content filled by pre-content event. 172 + */ 173 + return fsnotify_path(&file->f_path, FS_ACCESS_PERM); 151 174 } 152 175 153 176 /* 154 - * fsnotify_file_perm - permission hook before file access 177 + * fsnotify_truncate_perm - permission hook before file truncate 178 + */ 179 + static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) 180 + { 181 + struct inode *inode = d_inode(path->dentry); 182 + 183 + if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || 184 + !fsnotify_sb_has_priority_watchers(inode->i_sb, 185 + FSNOTIFY_PRIO_PRE_CONTENT)) 186 + return 0; 187 + 188 + return fsnotify_pre_content(path, &length, 0); 189 + } 190 + 191 + /* 192 + * fsnotify_file_perm - permission hook before file access (unknown range) 155 193 */ 156 194 static inline int fsnotify_file_perm(struct file *file, int perm_mask) 157 195 { ··· 200 168 { 201 169 int ret; 202 170 171 + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) 172 + return 0; 173 + 203 174 if (file->f_flags & __FMODE_EXEC) { 204 - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); 175 + ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); 205 176 if (ret) 206 177 return ret; 207 178 } 208 179 209 - return 
fsnotify_file(file, FS_OPEN_PERM); 180 + return fsnotify_path(&file->f_path, FS_OPEN_PERM); 210 181 } 211 182 212 183 #else 184 + static inline void file_set_fsnotify_mode(struct file *file) 185 + { 186 + } 187 + 213 188 static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, 214 189 const loff_t *ppos, size_t count) 190 + { 191 + return 0; 192 + } 193 + 194 + static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) 215 195 { 216 196 return 0; 217 197 }
+51 -2
include/linux/fsnotify_backend.h
··· 55 55 #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ 56 56 #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ 57 57 #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ 58 + /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ 59 + 60 + #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ 58 61 59 62 /* 60 63 * Set on inode mark that cares about things that happen to its children. ··· 80 77 */ 81 78 #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) 82 79 83 - #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ 84 - FS_OPEN_EXEC_PERM) 80 + /* Content events can be used to inspect file content */ 81 + #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ 82 + FS_ACCESS_PERM) 83 + /* Pre-content events can be used to fill file content */ 84 + #define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) 85 + 86 + #define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ 87 + FSNOTIFY_PRE_CONTENT_EVENTS) 85 88 86 89 /* 87 90 * This is a list of all events that may get sent to a parent that is watching ··· 294 285 /* When calling fsnotify tell it if the data is a path or inode */ 295 286 enum fsnotify_data_type { 296 287 FSNOTIFY_EVENT_NONE, 288 + FSNOTIFY_EVENT_FILE_RANGE, 297 289 FSNOTIFY_EVENT_PATH, 298 290 FSNOTIFY_EVENT_INODE, 299 291 FSNOTIFY_EVENT_DENTRY, ··· 307 297 struct super_block *sb; 308 298 }; 309 299 300 + struct file_range { 301 + const struct path *path; 302 + loff_t pos; 303 + size_t count; 304 + }; 305 + 306 + static inline const struct path *file_range_path(const struct file_range *range) 307 + { 308 + return range->path; 309 + } 310 + 310 311 static inline struct inode *fsnotify_data_inode(const void *data, int data_type) 311 312 { 312 313 switch (data_type) { ··· 327 306 return d_inode(data); 328 307 case FSNOTIFY_EVENT_PATH: 329 308 return d_inode(((const struct path 
*)data)->dentry); 309 + case FSNOTIFY_EVENT_FILE_RANGE: 310 + return d_inode(file_range_path(data)->dentry); 330 311 case FSNOTIFY_EVENT_ERROR: 331 312 return ((struct fs_error_report *)data)->inode; 332 313 default: ··· 344 321 return (struct dentry *)data; 345 322 case FSNOTIFY_EVENT_PATH: 346 323 return ((const struct path *)data)->dentry; 324 + case FSNOTIFY_EVENT_FILE_RANGE: 325 + return file_range_path(data)->dentry; 347 326 default: 348 327 return NULL; 349 328 } ··· 357 332 switch (data_type) { 358 333 case FSNOTIFY_EVENT_PATH: 359 334 return data; 335 + case FSNOTIFY_EVENT_FILE_RANGE: 336 + return file_range_path(data); 360 337 default: 361 338 return NULL; 362 339 } ··· 374 347 return ((struct dentry *)data)->d_sb; 375 348 case FSNOTIFY_EVENT_PATH: 376 349 return ((const struct path *)data)->dentry->d_sb; 350 + case FSNOTIFY_EVENT_FILE_RANGE: 351 + return file_range_path(data)->dentry->d_sb; 377 352 case FSNOTIFY_EVENT_ERROR: 378 353 return ((struct fs_error_report *) data)->sb; 379 354 default: ··· 390 361 switch (data_type) { 391 362 case FSNOTIFY_EVENT_ERROR: 392 363 return (struct fs_error_report *) data; 364 + default: 365 + return NULL; 366 + } 367 + } 368 + 369 + static inline const struct file_range *fsnotify_data_file_range( 370 + const void *data, 371 + int data_type) 372 + { 373 + switch (data_type) { 374 + case FSNOTIFY_EVENT_FILE_RANGE: 375 + return (struct file_range *)data; 393 376 default: 394 377 return NULL; 395 378 } ··· 895 854 { 896 855 INIT_LIST_HEAD(&event->list); 897 856 } 857 + int fsnotify_pre_content(const struct path *path, const loff_t *ppos, 858 + size_t count); 898 859 899 860 #else 861 + 862 + static inline int fsnotify_pre_content(const struct path *path, 863 + const loff_t *ppos, size_t count) 864 + { 865 + return 0; 866 + } 900 867 901 868 static inline int fsnotify(__u32 mask, const void *data, int data_type, 902 869 struct inode *dir, const struct qstr *name,
+1
include/linux/mm.h
··· 3431 3431 extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, 3432 3432 pgoff_t start_pgoff, pgoff_t end_pgoff); 3433 3433 extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); 3434 + extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); 3434 3435 3435 3436 extern unsigned long stack_guard_gap; 3436 3437 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
-1
include/uapi/asm-generic/fcntl.h
··· 6 6 7 7 /* 8 8 * FMODE_EXEC is 0x20 9 - * FMODE_NONOTIFY is 0x4000000 10 9 * These cannot be used by userspace O_* until internal and external open 11 10 * flags are split. 12 11 * -Eric Paris
+18
include/uapi/linux/fanotify.h
··· 25 25 #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ 26 26 #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ 27 27 #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ 28 + /* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ 29 + 30 + #define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ 28 31 29 32 #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ 30 33 ··· 146 143 #define FAN_EVENT_INFO_TYPE_DFID 3 147 144 #define FAN_EVENT_INFO_TYPE_PIDFD 4 148 145 #define FAN_EVENT_INFO_TYPE_ERROR 5 146 + #define FAN_EVENT_INFO_TYPE_RANGE 6 149 147 150 148 /* Special info types for FAN_RENAME */ 151 149 #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 ··· 193 189 __u32 error_count; 194 190 }; 195 191 192 + struct fanotify_event_info_range { 193 + struct fanotify_event_info_header hdr; 194 + __u32 pad; 195 + __u64 offset; 196 + __u64 count; 197 + }; 198 + 196 199 /* 197 200 * User space may need to record additional information about its decision. 198 201 * The extra information type records what kind of information is included. ··· 235 224 /* Legit userspace responses to a _PERM event */ 236 225 #define FAN_ALLOW 0x01 237 226 #define FAN_DENY 0x02 227 + /* errno other than EPERM can be specified in upper byte of deny response */ 228 + #define FAN_ERRNO_BITS 8 229 + #define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS) 230 + #define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1) 231 + #define FAN_DENY_ERRNO(err) \ 232 + (FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT)) 233 + 238 234 #define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */ 239 235 #define FAN_INFO 0x20 /* Bitmask to indicate additional information */ 240 236
+6 -6
kernel/fork.c
··· 625 625 * We depend on the oldmm having properly denied write access to the 626 626 * exe_file already. 627 627 */ 628 - if (exe_file && deny_write_access(exe_file)) 629 - pr_warn_once("deny_write_access() failed in %s\n", __func__); 628 + if (exe_file && exe_file_deny_write_access(exe_file)) 629 + pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__); 630 630 } 631 631 632 632 #ifdef CONFIG_MMU ··· 1416 1416 * We expect the caller (i.e., sys_execve) to already denied 1417 1417 * write access, so this is unlikely to fail. 1418 1418 */ 1419 - if (unlikely(deny_write_access(new_exe_file))) 1419 + if (unlikely(exe_file_deny_write_access(new_exe_file))) 1420 1420 return -EACCES; 1421 1421 get_file(new_exe_file); 1422 1422 } 1423 1423 rcu_assign_pointer(mm->exe_file, new_exe_file); 1424 1424 if (old_exe_file) { 1425 - allow_write_access(old_exe_file); 1425 + exe_file_allow_write_access(old_exe_file); 1426 1426 fput(old_exe_file); 1427 1427 } 1428 1428 return 0; ··· 1463 1463 return ret; 1464 1464 } 1465 1465 1466 - ret = deny_write_access(new_exe_file); 1466 + ret = exe_file_deny_write_access(new_exe_file); 1467 1467 if (ret) 1468 1468 return -EACCES; 1469 1469 get_file(new_exe_file); ··· 1475 1475 mmap_write_unlock(mm); 1476 1476 1477 1477 if (old_exe_file) { 1478 - allow_write_access(old_exe_file); 1478 + exe_file_allow_write_access(old_exe_file); 1479 1479 fput(old_exe_file); 1480 1480 } 1481 1481 return 0;
+86
mm/filemap.c
··· 47 47 #include <linux/splice.h> 48 48 #include <linux/rcupdate_wait.h> 49 49 #include <linux/sched/mm.h> 50 + #include <linux/fsnotify.h> 50 51 #include <asm/pgalloc.h> 51 52 #include <asm/tlbflush.h> 52 53 #include "internal.h" ··· 3142 3141 unsigned long vm_flags = vmf->vma->vm_flags; 3143 3142 unsigned int mmap_miss; 3144 3143 3144 + /* 3145 + * If we have pre-content watches we need to disable readahead to make 3146 + * sure that we don't populate our mapping with 0 filled pages that we 3147 + * never emitted an event for. 3148 + */ 3149 + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) 3150 + return fpin; 3151 + 3145 3152 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3146 3153 /* Use the readahead code, even if readahead is disabled */ 3147 3154 if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { ··· 3218 3209 struct file *fpin = NULL; 3219 3210 unsigned int mmap_miss; 3220 3211 3212 + /* See comment in do_sync_mmap_readahead. */ 3213 + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) 3214 + return fpin; 3215 + 3221 3216 /* If we don't want any read-ahead, don't bother */ 3222 3217 if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) 3223 3218 return fpin; ··· 3279 3266 pte_unmap(ptep); 3280 3267 return ret; 3281 3268 } 3269 + 3270 + /** 3271 + * filemap_fsnotify_fault - maybe emit a pre-content event. 3272 + * @vmf: struct vm_fault containing details of the fault. 3273 + * 3274 + * If we have a pre-content watch on this file we will emit an event for this 3275 + * range. If we return anything the fault caller should return immediately, we 3276 + * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the 3277 + * fault again and then the fault handler will run the second time through. 3278 + * 3279 + * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened. 
3280 + */ 3281 + vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) 3282 + { 3283 + struct file *fpin = NULL; 3284 + int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS; 3285 + loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 3286 + size_t count = PAGE_SIZE; 3287 + int err; 3288 + 3289 + /* 3290 + * We already did this and now we're retrying with everything locked, 3291 + * don't emit the event and continue. 3292 + */ 3293 + if (vmf->flags & FAULT_FLAG_TRIED) 3294 + return 0; 3295 + 3296 + /* No watches, we're done. */ 3297 + if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode))) 3298 + return 0; 3299 + 3300 + fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3301 + if (!fpin) 3302 + return VM_FAULT_SIGBUS; 3303 + 3304 + err = fsnotify_file_area_perm(fpin, mask, &pos, count); 3305 + fput(fpin); 3306 + if (err) 3307 + return VM_FAULT_SIGBUS; 3308 + return VM_FAULT_RETRY; 3309 + } 3310 + EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); 3282 3311 3283 3312 /** 3284 3313 * filemap_fault - read in file data for page fault handling ··· 3425 3370 * or because readahead was otherwise unable to retrieve it. 3426 3371 */ 3427 3372 if (unlikely(!folio_test_uptodate(folio))) { 3373 + /* 3374 + * If this is a precontent file we can now emit an event to 3375 + * try and populate the folio. 3376 + */ 3377 + if (!(vmf->flags & FAULT_FLAG_TRIED) && 3378 + unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { 3379 + loff_t pos = folio_pos(folio); 3380 + size_t count = folio_size(folio); 3381 + 3382 + /* We're NOWAIT, we have to retry. 
*/ 3383 + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) { 3384 + folio_unlock(folio); 3385 + goto out_retry; 3386 + } 3387 + 3388 + if (mapping_locked) 3389 + filemap_invalidate_unlock_shared(mapping); 3390 + mapping_locked = false; 3391 + 3392 + folio_unlock(folio); 3393 + fpin = maybe_unlock_mmap_for_io(vmf, fpin); 3394 + if (!fpin) 3395 + goto out_retry; 3396 + 3397 + error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos, 3398 + count); 3399 + if (error) 3400 + ret = VM_FAULT_SIGBUS; 3401 + goto out_retry; 3402 + } 3403 + 3428 3404 /* 3429 3405 * If the invalidate lock is not held, the folio was in cache 3430 3406 * and uptodate and now it is not. Strange but possible since we
+19
mm/memory.c
··· 76 76 #include <linux/ptrace.h> 77 77 #include <linux/vmalloc.h> 78 78 #include <linux/sched/sysctl.h> 79 + #include <linux/fsnotify.h> 79 80 80 81 #include <trace/events/kmem.h> 81 82 ··· 5663 5662 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) 5664 5663 { 5665 5664 struct vm_area_struct *vma = vmf->vma; 5665 + 5666 5666 if (vma_is_anonymous(vma)) 5667 5667 return do_huge_pmd_anonymous_page(vmf); 5668 + /* 5669 + * Currently we just emit PAGE_SIZE for our fault events, so don't allow 5670 + * a huge fault if we have a pre content watch on this file. This would 5671 + * be trivial to support, but there would need to be tests to ensure 5672 + * this works properly and those don't exist currently. 5673 + */ 5674 + if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) 5675 + return VM_FAULT_FALLBACK; 5668 5676 if (vma->vm_ops->huge_fault) 5669 5677 return vma->vm_ops->huge_fault(vmf, PMD_ORDER); 5670 5678 return VM_FAULT_FALLBACK; ··· 5697 5687 } 5698 5688 5699 5689 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { 5690 + /* See comment in create_huge_pmd. */ 5691 + if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) 5692 + goto split; 5700 5693 if (vma->vm_ops->huge_fault) { 5701 5694 ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); 5702 5695 if (!(ret & VM_FAULT_FALLBACK)) ··· 5722 5709 /* No support for anonymous transparent PUD pages yet */ 5723 5710 if (vma_is_anonymous(vma)) 5724 5711 return VM_FAULT_FALLBACK; 5712 + /* See comment in create_huge_pmd. */ 5713 + if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) 5714 + return VM_FAULT_FALLBACK; 5725 5715 if (vma->vm_ops->huge_fault) 5726 5716 return vma->vm_ops->huge_fault(vmf, PUD_ORDER); 5727 5717 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ··· 5742 5726 if (vma_is_anonymous(vma)) 5743 5727 goto split; 5744 5728 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { 5729 + /* See comment in create_huge_pmd. 
*/ 5730 + if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) 5731 + goto split; 5745 5732 if (vma->vm_ops->huge_fault) { 5746 5733 ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); 5747 5734 if (!(ret & VM_FAULT_FALLBACK))
+7
mm/nommu.c
··· 1613 1613 } 1614 1614 EXPORT_SYMBOL(remap_vmalloc_range); 1615 1615 1616 + vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) 1617 + { 1618 + BUG(); 1619 + return 0; 1620 + } 1621 + EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); 1622 + 1616 1623 vm_fault_t filemap_fault(struct vm_fault *vmf) 1617 1624 { 1618 1625 BUG();
+14
mm/readahead.c
··· 128 128 #include <linux/blk-cgroup.h> 129 129 #include <linux/fadvise.h> 130 130 #include <linux/sched/mm.h> 131 + #include <linux/fsnotify.h> 131 132 132 133 #include "internal.h" 133 134 ··· 550 549 pgoff_t prev_index, miss; 551 550 552 551 /* 552 + * If we have pre-content watches we need to disable readahead to make 553 + * sure that we don't find 0 filled pages in cache that we never emitted 554 + * events for. Filesystems supporting HSM must make sure to not call 555 + * this function with ractl->file unset for files handled by HSM. 556 + */ 557 + if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode))) 558 + return; 559 + 560 + /* 553 561 * Even if readahead is disabled, issue this request as readahead 554 562 * as we'll need it to satisfy the requested range. The forced 555 563 * readahead will do the right thing and limit the read to just the ··· 634 624 635 625 /* no readahead */ 636 626 if (!ra->ra_pages) 627 + return; 628 + 629 + /* See the comment in page_cache_sync_ra. */ 630 + if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode))) 637 631 return; 638 632 639 633 /*
+2 -1
security/selinux/hooks.c
··· 3404 3404 perm |= FILE__WATCH_WITH_PERM; 3405 3405 3406 3406 /* watches on read-like events need the file:watch_reads permission */ 3407 - if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) 3407 + if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS | 3408 + FS_CLOSE_NOWRITE)) 3408 3409 perm |= FILE__WATCH_READS; 3409 3410 3410 3411 return path_has_perm(current_cred(), path, perm);